about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Matěj Cepl <mcepl@redhat.com> 2014-01-02 15:55:18 +0100
committer Matěj Cepl <mcepl@redhat.com> 2014-01-02 15:55:18 +0100
commit 4b3df4fabb92dab4028cb794de897eaf3b8fde82 (patch)
tree 059ed1bd61151435830f2349368d61a29ae9a52e
parent 163aa69fd2b435b2ef180a2fe91c8112e12e15c3 (diff)
download gg_scraper-4b3df4fabb92dab4028cb794de897eaf3b8fde82.tar.gz
Creating raw MBOX fixed (tests included)
Fix #278 and #271
-rwxr-xr-x gg_scrapper.py 68
-rw-r--r-- test/group.yaml 154
-rw-r--r-- test/mbox.mbx 252
-rw-r--r-- test/test_functional.py 4
-rw-r--r-- test/test_unit.py 23
5 files changed, 467 insertions, 34 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 2ea9f92..f1cde21 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,21 +1,20 @@
#!/usr/bin/python3
import mailbox
+import os.path
import re
+import shutil
import subprocess
-import urllib.request
+import sys
import urllib.error
import urllib.parse
+import urllib.request
#from concurrent.futures import ProcessPoolExecutor
from bs4 import BeautifulSoup
import logging
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
level=logging.DEBUG)
-TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
-ARTICL_MSG_URL_RE = re.compile(r'https://groups.google.com/d/msg/')
-ARTICLE_COUNT_RE = re.compile(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$')
-
class Page(object):
verb_handler = urllib.request.HTTPHandler()
@@ -62,7 +61,7 @@ class Page(object):
class Article(Page):
def __init__(self, URL):
super(Article, self).__init__()
- self.root = URL.replace('#!msg/', 'message/raw?msg=')
+ self.root = URL.replace('d/msg/', 'forum/message/raw?msg=')
self.raw_message = ''
def collect_message(self):
@@ -70,9 +69,10 @@ class Article(Page):
raw_msg = res.read()
proc = subprocess.Popen(['/usr/bin/formail'],
stdin=subprocess.PIPE,
- stdout=subprocess.PIPE)
- result = proc.communicate(raw_msg)[0]
- return result.decode()
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
+ result = proc.communicate(raw_msg.decode())[0]
+ return result
class Topic(Page):
@@ -99,9 +99,7 @@ class Topic(Page):
raise ValueError('Cannot find count of topics!')
i_str = i_elem[0].string
- logging.debug('i_str = {}'.format(i_str))
- logging.debug('RE = {}'.format(ARTICLE_COUNT_RE.pattern))
- return int(ARTICLE_COUNT_RE.match(i_str).group(1))
+ return int(re.match(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$', i_str).group(1))
def get_articles(self):
out = []
@@ -109,8 +107,8 @@ class Topic(Page):
for a_elem in page.find_all('a'):
if 'href' in a_elem.attrs:
a_href = a_elem['href']
- if ARTICL_MSG_URL_RE.match(a_href) is not None:
- logging.debug('a_elem = %s', a_href)
+ if re.match(r'https://groups.google.com/d/msg/',
+ a_href) is not None:
out.append(Article(a_href))
return out
@@ -120,6 +118,11 @@ class Group(Page):
def __init__(self, URL):
super(Group, self).__init__()
self.group_URL = URL
+ self.topics = []
+ match = re.match(r'https://groups.google.com/forum/#!forum/(.+)',
+ URL)
+ if match is not None:
+ self.name = match.group(1)
@staticmethod
def get_count_topics(BS):
@@ -134,7 +137,7 @@ class Group(Page):
raise ValueError('Cannot find count of topics!')
i_str = i_elem[0].string
- return int(TOPIC_COUNT_RE.match(i_str).group(1))
+ return int(re.match(r'\D+ \d+ - \d+ \D+ (\d+) \D+$', i_str).group(1))
@staticmethod
def get_one_topic(elem):
@@ -172,30 +175,49 @@ class Group(Page):
return out
def collect_group(self):
- topics = self.get_topics()
- for top in topics:
+ self.topics = self.get_topics()
+ for top in self.topics:
arts = top.get_articles()
top.articles = arts
for a in arts:
msg = a.collect_message()
a.raw_message = msg
+ def all_messages(self):
+ '''Iterate over all messages in the group'''
+ for top in self.topics:
+ for art in top.articles:
+ yield art.raw_message
+
class MBOX(mailbox.mbox):
def __init__(self, filename):
- super(MBOX, self).__init__()
+ if os.path.exists(filename):
+ shutil.move(filename, '{}.bak'.format(filename))
+ super(MBOX, self).__init__(filename)
self.box_name = filename
def write_group(self, group_object):
- pass
+ self.lock()
+ for mbx_str in group_object.all_messages():
+ self.add(mbx_str.encode())
+ self.unlock()
+ self.close()
-def main(group_name, group_URL):
+def main(group_URL):
# Collect all messages to the internal variables
grp = Group(group_URL)
grp.collect_group()
+ import yaml
+ # dump the state for debugging
+ with open('group.yaml', 'w') as yf:
+ yaml.dump(grp, yf)
+
# Write MBOX
- mbx = MBOX()
- mbx.format_mbox(grp)
- mbx.save("{}.mbx".format(group_name))
+ mbx = MBOX("{}.mbx".format(grp.name))
+ mbx.write_group(grp)
+
+if __name__ == '__main__':
+ main(sys.argv[1])
diff --git a/test/group.yaml b/test/group.yaml
new file mode 100644
index 0000000..44cac25
--- /dev/null
+++ b/test/group.yaml
@@ -0,0 +1,154 @@
+!!python/object:gg_scrapper.Group
+group_URL: https://groups.google.com/forum/#!forum/javascriptcz
+name: javascriptcz
+topics:
+- !!python/object:gg_scrapper.Topic
+ articles:
+ - !!python/object:gg_scrapper.Article {raw_message: "From scho...@schovi.cz Thu Jan\
+ \ 2 15:15:33 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\
+ \ Sat, 28 May 2011 08:18:46 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May\n\
+ \ 2011 08:18:45 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.205.130\
+ \ with SMTP id fq2mr904146qab.18.1306595925702; Sat,\n 28 May 2011 08:18:45\
+ \ -0700 (PDT)\nReceived: by p6g2000vbn.googlegroups.com with HTTP; Sat, 28 May\
+ \ 2011 08:18:45\n -0700 (PDT)\nDate: Sat, 28 May 2011 08:18:45 -0700 (PDT)\n\
+ User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X\
+ \ 10_6_7)\n AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.772.0 Safari/535.1,gzip(gfe)\n\
+ Message-ID: <c4ee7911-2a16-487b-8d96-1c0997cc1e24@p6g2000vbn.googlegroups.com>\n\
+ Subject: =?ISO-8859-2?Q?Zdroje=2C_kter=E9_byste_nem=ECli_minout=2E?=\nFrom:\
+ \ David Schovanec <scho...@schovi.cz>\nTo: \"javascript.cz\" <javasc...@googlegroups.com>\n\
+ Content-Type: text/plain; charset=ISO-8859-2\nContent-Transfer-Encoding: quoted-printable\n\
+ \n\nRozs=E1hl=E9 pojedn=E1n=ED o tom, jak se pracuje s funkcema -\nhttp://kangax.github.com/nfe/\n\
+ \nBlog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu\nhttp://javascriptweblog.wordpress.com/\n\
+ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/5tbTMhlt4s0/t7QWA3IHsV0J'}
+ name: "Zdroje, kter\xE9 byste nem\u011Bli minout."
+ root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/5tbTMhlt4s0
+- !!python/object:gg_scrapper.Topic
+ articles:
+ - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
+ \ Jan 2 15:15:35 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\
+ \ Thu, 26 May 2011 01:34:27 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May\n\
+ \ 2011 01:34:26 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.32.129 with\
+ \ SMTP id c1mr163718qad.40.1306398866188; Thu,\n 26 May 2011 01:34:26 -0700\
+ \ (PDT)\nReceived: by a26g2000vbo.googlegroups.com with HTTP; Thu, 26 May 2011\
+ \ 01:34:26\n -0700 (PDT)\nDate: Thu, 26 May 2011 01:34:26 -0700 (PDT)\nUser-Agent:\
+ \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\
+ \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)\n\
+ Message-ID: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\
+ Subject: SproutCore\nFrom: Josef Richter <richte...@gmail.com>\nTo: \"javascript.cz\"\
+ \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\
+ Content-Transfer-Encoding: quoted-printable\n\n\nTak dneska bylo ohl=E1=B9eno\
+ \ SproutCore 2.0 a je k dispozici developer\npreview.\n\nHlavn=ED zm=ECna je\
+ \ ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html+c=\nss a\nhandlebars\
+ \ templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc, =\n=BEe\n\
+ se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly\np=F8eddefinovan=FD\
+ \ v=B9echny pages, panes, buttons, widgets, atd. tak=BEe t=\ny\nappky pak vypadaly\
+ \ v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=\n=FD.\n\nhttp://blog.sproutcore.com/announcing-sproutcore-2-0/\n\
+ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/pUoGBDjK_HcJ'}
+ - !!python/object:gg_scrapper.Article {raw_message: "From damn...@gmail.com Thu Jan\
+ \ 2 15:15:36 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\
+ \ Fri, 27 May 2011 04:29:49 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011\n\
+ \ 04:29:49 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.90.1.10 with SMTP\
+ \ id 10mr123075aga.25.1306495789240; Fri, 27\n May 2011 04:29:49 -0700 (PDT)\n\
+ Received: by n10g2000yqf.googlegroups.com with HTTP; Fri, 27 May 2011 04:29:49\n\
+ \ -0700 (PDT)\nDate: Fri, 27 May 2011 04:29:49 -0700 (PDT)\nIn-Reply-To: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\
+ References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\
+ User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; U; Intel Mac OS\
+ \ X 10_6_7; cs-cz)\n AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5\
+ \ Safari/533.21.1,gzip(gfe)\nMessage-ID: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\n\
+ Subject: Re: SproutCore\nFrom: pepe <damn...@gmail.com>\nTo: \"javascript.cz\"\
+ \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\
+ Content-Transfer-Encoding: quoted-printable\n\n\nPou=BE=EDval jsi to u=BE na\
+ \ n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=\n=E8it, =BEe to m=E1m\nvyzkou=B9et.\
+ \ A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.\n\n\nOn 26 kv=EC, 10:34, Josef\
+ \ Richter <richte...@gmail.com> wrote:\n> Tak dneska bylo ohl=E1=B9eno SproutCore\
+ \ 2.0 a je k dispozici developer\n> preview.\n>\n> Hlavn=ED zm=ECna je ve view\
+ \ layer - bude se pou=BE=EDvat nom=E1ln=EC html=\n+css a\n> handlebars templating.\
+ \ To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc=\n, =BEe\n> se celej view\
+ \ vlastn=EC psal z javascriptu, proto=BEe tam byly\n> p=F8eddefinovan=FD v=B9echny\
+ \ pages, panes, buttons, widgets, atd. tak=BEe=\n ty\n> appky pak vypadaly v=B9echny\
+ \ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d=\nn=FD.\n>\n> http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\
+ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/Gxus9ddtp5wJ'}
+ - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
+ \ Jan 2 15:15:36 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\
+ \ Fri, 27 May 2011 04:56:01 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May\n\
+ \ 2011 04:56:01 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.186.16 with\
+ \ SMTP id cq16mr523722qab.19.1306497361142; Fri,\n 27 May 2011 04:56:01 -0700\
+ \ (PDT)\nReceived: by v31g2000vbs.googlegroups.com with HTTP; Fri, 27 May 2011\
+ \ 04:56:01\n -0700 (PDT)\nDate: Fri, 27 May 2011 04:56:01 -0700 (PDT)\nIn-Reply-To:\
+ \ <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\nReferences:\
+ \ <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\n\
+ User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X\
+ \ 10_6_7)\n AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)\n\
+ Message-ID: <c6e41779-e622-4559-9381-9f250e90370d@v31g2000vbs.googlegroups.com>\n\
+ Subject: Re: SproutCore\nFrom: Josef Richter <richte...@gmail.com>\nTo: \"javascript.cz\"\
+ \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\
+ Content-Transfer-Encoding: quoted-printable\n\n\nNa produk=E8n=ED v=ECc ne.\
+ \ Sp=ED=B9 jsem si s t=EDm jenom hr=E1l, zkou=B9el=\n tutorialy\na tak. Tam\
+ \ byl obrovskej probl=E9m (podle m=EC) ta View vrstva, kter=E1 byl=\na\nstra=B9nej\
+ \ mastodont a tla=E8ila t=EC do n=ECjak=FDho stylu. Proto ty appky\nv=B9echny\
+ \ vypadaly prakticky stejn=EC jako MobileMe.\n\nA proto taky spoustu lid=ED\
+ \ nadchlo sp=ED=B9 Backbone, kter=FD na to =B9lo =\nz\ndruh=E9 strany a naopak\
+ \ je ultrajednoduch=FD. Nem=E1 v sob=EC nap=F8. v=F9b=\nec\n=BE=E1dn=FD templatov=E1n=ED\
+ \ a i ten MVC je tam trochu specifickej.\n\nTen SproutCore 2.0 by kone=E8n=EC\
+ \ mohl b=FDt pr=F9lom v popularit=EC. Ten Y=\nehuda\nnen=ED blbej a jeho ruka\
+ \ na tom za=E8=EDn=E1 b=FDt vid=ECt :-)\n\nOn May 27, 1:29=A0pm, pepe <damn...@gmail.com>\
+ \ wrote:\n> Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=\n\
+ =E8it, =BEe to m=E1m\n> vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.\n\
+ >\n> On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote:\n>\n>\n\
+ >\n>\n>\n>\n>\n> > Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici\
+ \ developer\n> > preview.\n>\n> > Hlavn=ED zm=ECna je ve view layer - bude se\
+ \ pou=BE=EDvat nom=E1ln=EC ht=\nml+css a\n> > handlebars templating. To mi na\
+ \ p=F8edchoz=EDch verz=EDch vadilo nejv=\n=EDc, =BEe\n> > se celej view vlastn=EC\
+ \ psal z javascriptu, proto=BEe tam byly\n> > p=F8eddefinovan=FD v=B9echny pages,\
+ \ panes, buttons, widgets, atd. tak=\n=BEe ty\n> > appky pak vypadaly v=B9echny\
+ \ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=\n=E1dn=FD.\n>\n> >http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\
+ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/IpL3eL0yancJ'}
+ name: SproutCore
+ root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/-4cy0XTGGaU
+- !!python/object:gg_scrapper.Topic
+ articles:
+ - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
+ \ Jan 2 15:15:39 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\
+ \ Wed, 25 May 2011 12:25:09 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May\n\
+ \ 2011 12:25:08 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.203.8 with\
+ \ SMTP id fg8mr904907qab.54.1306351508426; Wed,\n 25 May 2011 12:25:08 -0700\
+ \ (PDT)\nReceived: by n10g2000vby.googlegroups.com with HTTP; Wed, 25 May 2011\
+ \ 12:25:08\n -0700 (PDT)\nDate: Wed, 25 May 2011 12:25:08 -0700 (PDT)\nUser-Agent:\
+ \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\
+ \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)\n\
+ Message-ID: <32a62e7b-28f6-4b99-920a-eba49518b9a4@n10g2000vby.googlegroups.com>\n\
+ Subject: =?ISO-8859-1?Q?IRC_kan=E1l?=\nFrom: Josef Richter <richte...@gmail.com>\n\
+ To: \"javascript.cz\" <javasc...@googlegroups.com>\nContent-Type: text/plain;\
+ \ charset=ISO-8859-1\n\n\n#javascript.cz na irc.freenode.net\n\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/KpLLZ7thax4/nxKqd5qBVTIJ'}
+ name: "IRC kan\xE1l"
+ root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/KpLLZ7thax4
+- !!python/object:gg_scrapper.Topic
+ articles:
+ - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
+ \ Jan 2 15:15:40 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\
+ \ Wed, 25 May 2011 05:05:20 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
+ Received: by 10.224.138.148 with SMTP id a20ls76694qau.5.gmail; Wed, 25 May\n\
+ \ 2011 05:05:20 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.217.200\
+ \ with SMTP id hn8mr722125qab.0.1306325120116; Wed,\n 25 May 2011 05:05:20 -0700\
+ \ (PDT)\nReceived: by 32g2000vbe.googlegroups.com with HTTP; Wed, 25 May 2011\
+ \ 05:05:20\n -0700 (PDT)\nDate: Wed, 25 May 2011 05:05:20 -0700 (PDT)\nUser-Agent:\
+ \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\
+ \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)\n\
+ Message-ID: <df869047-365d-40da-8641-8c74d1ae2090@32g2000vbe.googlegroups.com>\n\
+ Subject: =?ISO-8859-2?Q?=C8lenov=E9?=\nFrom: Josef Richter <richte...@gmail.com>\n\
+ To: \"javascript.cz\" <javasc...@googlegroups.com>\nContent-Type: text/plain;\
+ \ charset=ISO-8859-2\nContent-Transfer-Encoding: quoted-printable\n\n\nbylo\
+ \ by fajn kdyby ka=BEd=FD =E8len napsal krati=E8k=E9 info o sob=EC\n\nvykop=E1v=E1m:\n\
+ \njsem web designer a front-end developer, d=ECl=E1m hlavn=EC jednodu=B9=B9=\n\
+ =ED v=ECci\nv jQuery, ale jak se st=E1vaj=ED slo=BEit=ECj=B9=EDmi, tak se sna=BE=EDm\
+ \ pr=\noniknout do\nBackbone.js, SproutCore, apod. Experimentuju i s jQuery\
+ \ Mobile a\nmobiln=EDm aplikacemi v=F9bec. Slu=B9n=EC se orientuju v Ruby on\
+ \ Rails a ve=\ndu\nmal=FD developersk=FD team, zam=EC=F8en=FD na agiln=ED development\
+ \ webov=FD=\nch\naplikac=ED.\n\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/hB3Rjgd5SBA/aKCbqJFbN-sJ'}
+ name: "\u010Clenov\xE9"
+ root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/hB3Rjgd5SBA
diff --git a/test/mbox.mbx b/test/mbox.mbx
new file mode 100644
index 0000000..026dc5a
--- /dev/null
+++ b/test/mbox.mbx
@@ -0,0 +1,252 @@
+From scho...@schovi.cz Thu Jan 2 15:15:33 2014
+Received: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;
+ Sat, 28 May 2011 08:18:46 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May
+ 2011 08:18:45 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.224.205.130 with SMTP id fq2mr904146qab.18.1306595925702; Sat,
+ 28 May 2011 08:18:45 -0700 (PDT)
+Received: by p6g2000vbn.googlegroups.com with HTTP; Sat, 28 May 2011 08:18:45
+ -0700 (PDT)
+Date: Sat, 28 May 2011 08:18:45 -0700 (PDT)
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)
+ AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.772.0 Safari/535.1,gzip(gfe)
+Message-ID: <c4ee7911-2a16-487b-8d96-1c0997cc1e24@p6g2000vbn.googlegroups.com>
+Subject: =?ISO-8859-2?Q?Zdroje=2C_kter=E9_byste_nem=ECli_minout=2E?=
+From: David Schovanec <scho...@schovi.cz>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-2
+Content-Transfer-Encoding: quoted-printable
+
+
+Rozs=E1hl=E9 pojedn=E1n=ED o tom, jak se pracuje s funkcema -
+http://kangax.github.com/nfe/
+
+Blog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu
+http://javascriptweblog.wordpress.com/
+
+
+From richte...@gmail.com Thu Jan 2 15:15:35 2014
+Received: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;
+ Thu, 26 May 2011 01:34:27 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May
+ 2011 01:34:26 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.224.32.129 with SMTP id c1mr163718qad.40.1306398866188; Thu,
+ 26 May 2011 01:34:26 -0700 (PDT)
+Received: by a26g2000vbo.googlegroups.com with HTTP; Thu, 26 May 2011 01:34:26
+ -0700 (PDT)
+Date: Thu, 26 May 2011 01:34:26 -0700 (PDT)
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)
+ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)
+Message-ID: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>
+Subject: SproutCore
+From: Josef Richter <richte...@gmail.com>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-2
+Content-Transfer-Encoding: quoted-printable
+
+
+Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer
+preview.
+
+Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html+c=
+ss a
+handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc, =
+=BEe
+se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly
+p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak=BEe t=
+y
+appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=
+=FD.
+
+http://blog.sproutcore.com/announcing-sproutcore-2-0/
+
+
+From damn...@gmail.com Thu Jan 2 15:15:36 2014
+Received: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;
+ Fri, 27 May 2011 04:29:49 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011
+ 04:29:49 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.90.1.10 with SMTP id 10mr123075aga.25.1306495789240; Fri, 27
+ May 2011 04:29:49 -0700 (PDT)
+Received: by n10g2000yqf.googlegroups.com with HTTP; Fri, 27 May 2011 04:29:49
+ -0700 (PDT)
+Date: Fri, 27 May 2011 04:29:49 -0700 (PDT)
+In-Reply-To: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>
+References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; cs-cz)
+ AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1,gzip(gfe)
+Message-ID: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>
+Subject: Re: SproutCore
+From: pepe <damn...@gmail.com>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-2
+Content-Transfer-Encoding: quoted-printable
+
+
+Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=
+=E8it, =BEe to m=E1m
+vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.
+
+
+On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote:
+> Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer
+> preview.
+>
+> Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html=
++css a
+> handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc=
+, =BEe
+> se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly
+> p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak=BEe=
+ ty
+> appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d=
+n=FD.
+>
+> http://blog.sproutcore.com/announcing-sproutcore-2-0/
+
+
+From richte...@gmail.com Thu Jan 2 15:15:36 2014
+Received: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;
+ Fri, 27 May 2011 04:56:01 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May
+ 2011 04:56:01 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.224.186.16 with SMTP id cq16mr523722qab.19.1306497361142; Fri,
+ 27 May 2011 04:56:01 -0700 (PDT)
+Received: by v31g2000vbs.googlegroups.com with HTTP; Fri, 27 May 2011 04:56:01
+ -0700 (PDT)
+Date: Fri, 27 May 2011 04:56:01 -0700 (PDT)
+In-Reply-To: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>
+References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>
+ <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)
+ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)
+Message-ID: <c6e41779-e622-4559-9381-9f250e90370d@v31g2000vbs.googlegroups.com>
+Subject: Re: SproutCore
+From: Josef Richter <richte...@gmail.com>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-2
+Content-Transfer-Encoding: quoted-printable
+
+
+Na produk=E8n=ED v=ECc ne. Sp=ED=B9 jsem si s t=EDm jenom hr=E1l, zkou=B9el=
+ tutorialy
+a tak. Tam byl obrovskej probl=E9m (podle m=EC) ta View vrstva, kter=E1 byl=
+a
+stra=B9nej mastodont a tla=E8ila t=EC do n=ECjak=FDho stylu. Proto ty appky
+v=B9echny vypadaly prakticky stejn=EC jako MobileMe.
+
+A proto taky spoustu lid=ED nadchlo sp=ED=B9 Backbone, kter=FD na to =B9lo =
+z
+druh=E9 strany a naopak je ultrajednoduch=FD. Nem=E1 v sob=EC nap=F8. v=F9b=
+ec
+=BE=E1dn=FD templatov=E1n=ED a i ten MVC je tam trochu specifickej.
+
+Ten SproutCore 2.0 by kone=E8n=EC mohl b=FDt pr=F9lom v popularit=EC. Ten Y=
+ehuda
+nen=ED blbej a jeho ruka na tom za=E8=EDn=E1 b=FDt vid=ECt :-)
+
+On May 27, 1:29=A0pm, pepe <damn...@gmail.com> wrote:
+> Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=
+=E8it, =BEe to m=E1m
+> vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.
+>
+> On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote:
+>
+>
+>
+>
+>
+>
+>
+> > Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer
+> > preview.
+>
+> > Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC ht=
+ml+css a
+> > handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=
+=EDc, =BEe
+> > se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly
+> > p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak=
+=BEe ty
+> > appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=
+=E1dn=FD.
+>
+> >http://blog.sproutcore.com/announcing-sproutcore-2-0/
+
+
+From richte...@gmail.com Thu Jan 2 15:15:39 2014
+Received: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;
+ Wed, 25 May 2011 12:25:09 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May
+ 2011 12:25:08 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.224.203.8 with SMTP id fg8mr904907qab.54.1306351508426; Wed,
+ 25 May 2011 12:25:08 -0700 (PDT)
+Received: by n10g2000vby.googlegroups.com with HTTP; Wed, 25 May 2011 12:25:08
+ -0700 (PDT)
+Date: Wed, 25 May 2011 12:25:08 -0700 (PDT)
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)
+ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)
+Message-ID: <32a62e7b-28f6-4b99-920a-eba49518b9a4@n10g2000vby.googlegroups.com>
+Subject: =?ISO-8859-1?Q?IRC_kan=E1l?=
+From: Josef Richter <richte...@gmail.com>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-1
+
+
+#javascript.cz na irc.freenode.net
+
+
+From richte...@gmail.com Thu Jan 2 15:15:40 2014
+Received: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;
+ Wed, 25 May 2011 05:05:20 -0700 (PDT)
+X-BeenThere: javascriptcz@googlegroups.com
+Received: by 10.224.138.148 with SMTP id a20ls76694qau.5.gmail; Wed, 25 May
+ 2011 05:05:20 -0700 (PDT)
+MIME-Version: 1.0
+Received: by 10.224.217.200 with SMTP id hn8mr722125qab.0.1306325120116; Wed,
+ 25 May 2011 05:05:20 -0700 (PDT)
+Received: by 32g2000vbe.googlegroups.com with HTTP; Wed, 25 May 2011 05:05:20
+ -0700 (PDT)
+Date: Wed, 25 May 2011 05:05:20 -0700 (PDT)
+User-Agent: G2/1.0
+X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)
+ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)
+Message-ID: <df869047-365d-40da-8641-8c74d1ae2090@32g2000vbe.googlegroups.com>
+Subject: =?ISO-8859-2?Q?=C8lenov=E9?=
+From: Josef Richter <richte...@gmail.com>
+To: "javascript.cz" <javasc...@googlegroups.com>
+Content-Type: text/plain; charset=ISO-8859-2
+Content-Transfer-Encoding: quoted-printable
+
+
+bylo by fajn kdyby ka=BEd=FD =E8len napsal krati=E8k=E9 info o sob=EC
+
+vykop=E1v=E1m:
+
+jsem web designer a front-end developer, d=ECl=E1m hlavn=EC jednodu=B9=B9=
+=ED v=ECci
+v jQuery, ale jak se st=E1vaj=ED slo=BEit=ECj=B9=EDmi, tak se sna=BE=EDm pr=
+oniknout do
+Backbone.js, SproutCore, apod. Experimentuju i s jQuery Mobile a
+mobiln=EDm aplikacemi v=F9bec. Slu=B9n=EC se orientuju v Ruby on Rails a ve=
+du
+mal=FD developersk=FD team, zam=EC=F8en=FD na agiln=ED development webov=FD=
+ch
+aplikac=ED.
+
+
diff --git a/test/test_functional.py b/test/test_functional.py
index 3f657f6..3e8d874 100644
--- a/test/test_functional.py
+++ b/test/test_functional.py
@@ -10,7 +10,7 @@ ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
EXP_URL = 'https://groups.google.com/forum/' + \
'?_escaped_fragment_=forum/jbrout'
TOPIC_URL = 'https://groups.google.com/forum/#!topic/jbrout/xNwoVmC07KI'
-ARTICLE_URL = 'https://groups.google.com/forum/#!msg/jbrout' + \
+ARTICLE_URL = 'https://groups.google.com/d/msg/jbrout' + \
'/xNwoVmC07KI/OfpRHFscUkwJ'
@@ -34,9 +34,7 @@ class TestGGScrapperFunctional(unittest.TestCase):
def test_get_raw_article(self):
self.maxDiff = None
- logging.debug('article = URL {}'.format(ARTICLE_URL))
article = gg_scrapper.Article(ARTICLE_URL)
- logging.debug('article = raw URL {}'.format(article.root))
rfc_msg = article.collect_message().replace('\r\n', '\n')
rfc_msg = '\n'.join(rfc_msg.split('\n')[1:])
diff --git a/test/test_unit.py b/test/test_unit.py
index e79c635..56ec08f 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -1,6 +1,9 @@
-import pickle
+import os
+import tempfile
+import yaml
import unittest
import gg_scrapper
+from gg_scrapper import Group, Topic, Article # noqa
IN_URL = 'https://groups.google.com/forum/#!forum/jbrout'
ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
@@ -22,15 +25,19 @@ class TestMBOX(unittest.TestCase):
def test_create_mbox(self):
'''Create a mbox file from (pickled) Group
'''
- group_file_name = 'test/group.pickle'
- with open(group_file_name, 'r', encoding='utf8') as group_f:
- group = pickle.load(group_f)
+ group_file_name = 'test/group.yaml'
+ with open(group_file_name, 'r') as group_f:
+ group = yaml.load(group_f)
- mbx = gg_scrapper.MBOX()
- mbx.format_mbox(group)
+ mbx_file = tempfile.NamedTemporaryFile('w', delete=False)
+ mbx = gg_scrapper.MBOX(mbx_file.name)
+ mbx.write_group(group)
- with open('test/generated_mbox.mbx') as exp_f:
- self.assertEqual(exp_f.read(), mbx.mbox_string)
+ with open('test/mbox.mbx') as exp_f:
+ with open(mbx_file.name) as mbx_f:
+ self.assertEqual(exp_f.read(), mbx_f.read())
+
+ os.unlink(mbx_file.name)
if __name__ == '__main__':
unittest.main()