diff options
-rwxr-xr-x | gg_scrapper.py | 68 | ||||
-rw-r--r-- | test/group.yaml | 154 | ||||
-rw-r--r-- | test/mbox.mbx | 252 | ||||
-rw-r--r-- | test/test_functional.py | 4 | ||||
-rw-r--r-- | test/test_unit.py | 23 |
5 files changed, 467 insertions, 34 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py index 2ea9f92..f1cde21 100755 --- a/gg_scrapper.py +++ b/gg_scrapper.py @@ -1,21 +1,20 @@ #!/usr/bin/python3 import mailbox +import os.path import re +import shutil import subprocess -import urllib.request +import sys import urllib.error import urllib.parse +import urllib.request #from concurrent.futures import ProcessPoolExecutor from bs4 import BeautifulSoup import logging logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) -TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$') -ARTICL_MSG_URL_RE = re.compile(r'https://groups.google.com/d/msg/') -ARTICLE_COUNT_RE = re.compile(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$') - class Page(object): verb_handler = urllib.request.HTTPHandler() @@ -62,7 +61,7 @@ class Page(object): class Article(Page): def __init__(self, URL): super(Article, self).__init__() - self.root = URL.replace('#!msg/', 'message/raw?msg=') + self.root = URL.replace('d/msg/', 'forum/message/raw?msg=') self.raw_message = '' def collect_message(self): @@ -70,9 +69,10 @@ class Article(Page): raw_msg = res.read() proc = subprocess.Popen(['/usr/bin/formail'], stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - result = proc.communicate(raw_msg)[0] - return result.decode() + stdout=subprocess.PIPE, + universal_newlines=True) + result = proc.communicate(raw_msg.decode())[0] + return result class Topic(Page): @@ -99,9 +99,7 @@ class Topic(Page): raise ValueError('Cannot find count of topics!') i_str = i_elem[0].string - logging.debug('i_str = {}'.format(i_str)) - logging.debug('RE = {}'.format(ARTICLE_COUNT_RE.pattern)) - return int(ARTICLE_COUNT_RE.match(i_str).group(1)) + return int(re.match(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$', i_str).group(1)) def get_articles(self): out = [] @@ -109,8 +107,8 @@ class Topic(Page): for a_elem in page.find_all('a'): if 'href' in a_elem.attrs: a_href = a_elem['href'] - if ARTICL_MSG_URL_RE.match(a_href) is not None: - logging.debug('a_elem = %s', a_href) + if re.match(r'https://groups.google.com/d/msg/', + a_href) is not None: out.append(Article(a_href)) return out @@ -120,6 +118,11 @@ class Group(Page): def __init__(self, URL): super(Group, self).__init__() self.group_URL = URL + self.topics = [] + match = re.match(r'https://groups.google.com/forum/#!forum/(.+)', + URL) + if match is not None: + self.name = match.group(1) @staticmethod def get_count_topics(BS): @@ -134,7 +137,7 @@ class Group(Page): raise ValueError('Cannot find count of topics!') i_str = i_elem[0].string - return int(TOPIC_COUNT_RE.match(i_str).group(1)) + return int(re.match(r'\D+ \d+ - \d+ \D+ (\d+) \D+$', i_str).group(1)) @staticmethod def get_one_topic(elem): @@ -172,30 +175,49 @@ class Group(Page): return out def collect_group(self): - topics = self.get_topics() - for top in topics: + self.topics = self.get_topics() + for top in self.topics: arts = top.get_articles() top.articles = arts for a in arts: msg = a.collect_message() a.raw_message = msg + def all_messages(self): + '''Iterate over all messages in the group''' + for top in self.topics: + for art in top.articles: + yield art.raw_message + class MBOX(mailbox.mbox): def __init__(self, filename): - super(MBOX, self).__init__() + if os.path.exists(filename): + shutil.move(filename, '{}.bak'.format(filename)) + super(MBOX, self).__init__(filename) self.box_name = filename def write_group(self, group_object): - pass + self.lock() + for mbx_str in group_object.all_messages(): + self.add(mbx_str.encode()) + self.unlock() + self.close() -def main(group_name, group_URL): +def main(group_URL): # Collect all messages to the internal variables grp = Group(group_URL) grp.collect_group() + import yaml + # dump the state for debugging + with open('group.yaml', 'w') as yf: + yaml.dump(grp, yf) + # Write MBOX - mbx = MBOX() - mbx.format_mbox(grp) - mbx.save("{}.mbx".format(group_name)) + mbx = MBOX("{}.mbx".format(grp.name)) + mbx.write_group(grp) + +if __name__ == '__main__': + main(sys.argv[1]) diff --git a/test/group.yaml b/test/group.yaml new file mode 100644 index 0000000..44cac25 --- /dev/null +++ b/test/group.yaml @@ -0,0 +1,154 @@ +!!python/object:gg_scrapper.Group +group_URL: https://groups.google.com/forum/#!forum/javascriptcz +name: javascriptcz +topics: +- !!python/object:gg_scrapper.Topic + articles: + - !!python/object:gg_scrapper.Article {raw_message: "From scho...@schovi.cz Thu Jan\ + \ 2 15:15:33 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\ + \ Sat, 28 May 2011 08:18:46 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May\n\ + \ 2011 08:18:45 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.205.130\ + \ with SMTP id fq2mr904146qab.18.1306595925702; Sat,\n 28 May 2011 08:18:45\ + \ -0700 (PDT)\nReceived: by p6g2000vbn.googlegroups.com with HTTP; Sat, 28 May\ + \ 2011 08:18:45\n -0700 (PDT)\nDate: Sat, 28 May 2011 08:18:45 -0700 (PDT)\n\ + User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X\ + \ 10_6_7)\n AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.772.0 Safari/535.1,gzip(gfe)\n\ + Message-ID: <c4ee7911-2a16-487b-8d96-1c0997cc1e24@p6g2000vbn.googlegroups.com>\n\ + Subject: =?ISO-8859-2?Q?Zdroje=2C_kter=E9_byste_nem=ECli_minout=2E?=\nFrom:\ + \ David Schovanec <scho...@schovi.cz>\nTo: \"javascript.cz\" <javasc...@googlegroups.com>\n\ + Content-Type: text/plain; charset=ISO-8859-2\nContent-Transfer-Encoding: quoted-printable\n\ + \n\nRozs=E1hl=E9 pojedn=E1n=ED o tom, jak se pracuje s funkcema -\nhttp://kangax.github.com/nfe/\n\ + \nBlog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu\nhttp://javascriptweblog.wordpress.com/\n\ + \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/5tbTMhlt4s0/t7QWA3IHsV0J'} + name: "Zdroje, kter\xE9 byste nem\u011Bli minout." + root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/5tbTMhlt4s0 +- !!python/object:gg_scrapper.Topic + articles: + - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ + \ Jan 2 15:15:35 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\ + \ Thu, 26 May 2011 01:34:27 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May\n\ + \ 2011 01:34:26 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.32.129 with\ + \ SMTP id c1mr163718qad.40.1306398866188; Thu,\n 26 May 2011 01:34:26 -0700\ + \ (PDT)\nReceived: by a26g2000vbo.googlegroups.com with HTTP; Thu, 26 May 2011\ + \ 01:34:26\n -0700 (PDT)\nDate: Thu, 26 May 2011 01:34:26 -0700 (PDT)\nUser-Agent:\ + \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\ + \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)\n\ + Message-ID: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\ + Subject: SproutCore\nFrom: Josef Richter <richte...@gmail.com>\nTo: \"javascript.cz\"\ + \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\ + Content-Transfer-Encoding: quoted-printable\n\n\nTak dneska bylo ohl=E1=B9eno\ + \ SproutCore 2.0 a je k dispozici developer\npreview.\n\nHlavn=ED zm=ECna je\ + \ ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html+c=\nss a\nhandlebars\ + \ templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc, =\n=BEe\n\ + se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly\np=F8eddefinovan=FD\ + \ v=B9echny pages, panes, buttons, widgets, atd. tak=BEe t=\ny\nappky pak vypadaly\ + \ v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=\n=FD.\n\nhttp://blog.sproutcore.com/announcing-sproutcore-2-0/\n\ + \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/pUoGBDjK_HcJ'} + - !!python/object:gg_scrapper.Article {raw_message: "From damn...@gmail.com Thu Jan\ + \ 2 15:15:36 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\ + \ Fri, 27 May 2011 04:29:49 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011\n\ + \ 04:29:49 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.90.1.10 with SMTP\ + \ id 10mr123075aga.25.1306495789240; Fri, 27\n May 2011 04:29:49 -0700 (PDT)\n\ + Received: by n10g2000yqf.googlegroups.com with HTTP; Fri, 27 May 2011 04:29:49\n\ + \ -0700 (PDT)\nDate: Fri, 27 May 2011 04:29:49 -0700 (PDT)\nIn-Reply-To: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\ + References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n\ + User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; U; Intel Mac OS\ + \ X 10_6_7; cs-cz)\n AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5\ + \ Safari/533.21.1,gzip(gfe)\nMessage-ID: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\n\ + Subject: Re: SproutCore\nFrom: pepe <damn...@gmail.com>\nTo: \"javascript.cz\"\ + \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\ + Content-Transfer-Encoding: quoted-printable\n\n\nPou=BE=EDval jsi to u=BE na\ + \ n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=\n=E8it, =BEe to m=E1m\nvyzkou=B9et.\ + \ A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.\n\n\nOn 26 kv=EC, 10:34, Josef\ + \ Richter <richte...@gmail.com> wrote:\n> Tak dneska bylo ohl=E1=B9eno SproutCore\ + \ 2.0 a je k dispozici developer\n> preview.\n>\n> Hlavn=ED zm=ECna je ve view\ + \ layer - bude se pou=BE=EDvat nom=E1ln=EC html=\n+css a\n> handlebars templating.\ + \ To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc=\n, =BEe\n> se celej view\ + \ vlastn=EC psal z javascriptu, proto=BEe tam byly\n> p=F8eddefinovan=FD v=B9echny\ + \ pages, panes, buttons, widgets, atd. tak=BEe=\n ty\n> appky pak vypadaly v=B9echny\ + \ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d=\nn=FD.\n>\n> http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\ + \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/Gxus9ddtp5wJ'} + - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ + \ Jan 2 15:15:36 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\ + \ Fri, 27 May 2011 04:56:01 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May\n\ + \ 2011 04:56:01 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.186.16 with\ + \ SMTP id cq16mr523722qab.19.1306497361142; Fri,\n 27 May 2011 04:56:01 -0700\ + \ (PDT)\nReceived: by v31g2000vbs.googlegroups.com with HTTP; Fri, 27 May 2011\ + \ 04:56:01\n -0700 (PDT)\nDate: Fri, 27 May 2011 04:56:01 -0700 (PDT)\nIn-Reply-To:\ + \ <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\nReferences:\ + \ <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com>\n <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com>\n\ + User-Agent: G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X\ + \ 10_6_7)\n AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe)\n\ + Message-ID: <c6e41779-e622-4559-9381-9f250e90370d@v31g2000vbs.googlegroups.com>\n\ + Subject: Re: SproutCore\nFrom: Josef Richter <richte...@gmail.com>\nTo: \"javascript.cz\"\ + \ <javasc...@googlegroups.com>\nContent-Type: text/plain; charset=ISO-8859-2\n\ + Content-Transfer-Encoding: quoted-printable\n\n\nNa produk=E8n=ED v=ECc ne.\ + \ Sp=ED=B9 jsem si s t=EDm jenom hr=E1l, zkou=B9el=\n tutorialy\na tak. Tam\ + \ byl obrovskej probl=E9m (podle m=EC) ta View vrstva, kter=E1 byl=\na\nstra=B9nej\ + \ mastodont a tla=E8ila t=EC do n=ECjak=FDho stylu. Proto ty appky\nv=B9echny\ + \ vypadaly prakticky stejn=EC jako MobileMe.\n\nA proto taky spoustu lid=ED\ + \ nadchlo sp=ED=B9 Backbone, kter=FD na to =B9lo =\nz\ndruh=E9 strany a naopak\ + \ je ultrajednoduch=FD. Nem=E1 v sob=EC nap=F8. v=F9b=\nec\n=BE=E1dn=FD templatov=E1n=ED\ + \ a i ten MVC je tam trochu specifickej.\n\nTen SproutCore 2.0 by kone=E8n=EC\ + \ mohl b=FDt pr=F9lom v popularit=EC. Ten Y=\nehuda\nnen=ED blbej a jeho ruka\ + \ na tom za=E8=EDn=E1 b=FDt vid=ECt :-)\n\nOn May 27, 1:29=A0pm, pepe <damn...@gmail.com>\ + \ wrote:\n> Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd=\n\ + =E8it, =BEe to m=E1m\n> vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou.\n\ + >\n> On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote:\n>\n>\n\ + >\n>\n>\n>\n>\n> > Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici\ + \ developer\n> > preview.\n>\n> > Hlavn=ED zm=ECna je ve view layer - bude se\ + \ pou=BE=EDvat nom=E1ln=EC ht=\nml+css a\n> > handlebars templating. To mi na\ + \ p=F8edchoz=EDch verz=EDch vadilo nejv=\n=EDc, =BEe\n> > se celej view vlastn=EC\ + \ psal z javascriptu, proto=BEe tam byly\n> > p=F8eddefinovan=FD v=B9echny pages,\ + \ panes, buttons, widgets, atd. tak=\n=BEe ty\n> > appky pak vypadaly v=B9echny\ + \ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=\n=E1dn=FD.\n>\n> >http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\ + \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/IpL3eL0yancJ'} + name: SproutCore + root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/-4cy0XTGGaU +- !!python/object:gg_scrapper.Topic + articles: + - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ + \ Jan 2 15:15:39 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\ + \ Wed, 25 May 2011 12:25:09 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May\n\ + \ 2011 12:25:08 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.203.8 with\ + \ SMTP id fg8mr904907qab.54.1306351508426; Wed,\n 25 May 2011 12:25:08 -0700\ + \ (PDT)\nReceived: by n10g2000vby.googlegroups.com with HTTP; Wed, 25 May 2011\ + \ 12:25:08\n -0700 (PDT)\nDate: Wed, 25 May 2011 12:25:08 -0700 (PDT)\nUser-Agent:\ + \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\ + \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)\n\ + Message-ID: <32a62e7b-28f6-4b99-920a-eba49518b9a4@n10g2000vby.googlegroups.com>\n\ + Subject: =?ISO-8859-1?Q?IRC_kan=E1l?=\nFrom: Josef Richter <richte...@gmail.com>\n\ + To: \"javascript.cz\" <javasc...@googlegroups.com>\nContent-Type: text/plain;\ + \ charset=ISO-8859-1\n\n\n#javascript.cz na irc.freenode.net\n\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/KpLLZ7thax4/nxKqd5qBVTIJ'} + name: "IRC kan\xE1l" + root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/KpLLZ7thax4 +- !!python/object:gg_scrapper.Topic + articles: + - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ + \ Jan 2 15:15:40 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\ + \ Wed, 25 May 2011 05:05:20 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ + Received: by 10.224.138.148 with SMTP id a20ls76694qau.5.gmail; Wed, 25 May\n\ + \ 2011 05:05:20 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.217.200\ + \ with SMTP id hn8mr722125qab.0.1306325120116; Wed,\n 25 May 2011 05:05:20 -0700\ + \ (PDT)\nReceived: by 32g2000vbe.googlegroups.com with HTTP; Wed, 25 May 2011\ + \ 05:05:20\n -0700 (PDT)\nDate: Wed, 25 May 2011 05:05:20 -0700 (PDT)\nUser-Agent:\ + \ G2/1.0\nX-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7)\n\ + \ AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe)\n\ + Message-ID: <df869047-365d-40da-8641-8c74d1ae2090@32g2000vbe.googlegroups.com>\n\ + Subject: =?ISO-8859-2?Q?=C8lenov=E9?=\nFrom: Josef Richter <richte...@gmail.com>\n\ + To: \"javascript.cz\" <javasc...@googlegroups.com>\nContent-Type: text/plain;\ + \ charset=ISO-8859-2\nContent-Transfer-Encoding: quoted-printable\n\n\nbylo\ + \ by fajn kdyby ka=BEd=FD =E8len napsal krati=E8k=E9 info o sob=EC\n\nvykop=E1v=E1m:\n\ + \njsem web designer a front-end developer, d=ECl=E1m hlavn=EC jednodu=B9=B9=\n\ + =ED v=ECci\nv jQuery, ale jak se st=E1vaj=ED slo=BEit=ECj=B9=EDmi, tak se sna=BE=EDm\ + \ pr=\noniknout do\nBackbone.js, SproutCore, apod. Experimentuju i s jQuery\ + \ Mobile a\nmobiln=EDm aplikacemi v=F9bec. Slu=B9n=EC se orientuju v Ruby on\ + \ Rails a ve=\ndu\nmal=FD developersk=FD team, zam=EC=F8en=FD na agiln=ED development\ + \ webov=FD=\nch\naplikac=ED.\n\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/hB3Rjgd5SBA/aKCbqJFbN-sJ'} + name: "\u010Clenov\xE9" + root: https://groups.google.com/forum/?_escaped_fragment_=topic/javascriptcz/hB3Rjgd5SBA diff --git a/test/mbox.mbx b/test/mbox.mbx new file mode 100644 index 0000000..026dc5a --- /dev/null +++ b/test/mbox.mbx @@ -0,0 +1,252 @@ +From scho...@schovi.cz Thu Jan 2 15:15:33 2014 +Received: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917; + Sat, 28 May 2011 08:18:46 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May + 2011 08:18:45 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.224.205.130 with SMTP id fq2mr904146qab.18.1306595925702; Sat, + 28 May 2011 08:18:45 -0700 (PDT) +Received: by p6g2000vbn.googlegroups.com with HTTP; Sat, 28 May 2011 08:18:45 + -0700 (PDT) +Date: Sat, 28 May 2011 08:18:45 -0700 (PDT) +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) + AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.772.0 Safari/535.1,gzip(gfe) +Message-ID: <c4ee7911-2a16-487b-8d96-1c0997cc1e24@p6g2000vbn.googlegroups.com> +Subject: =?ISO-8859-2?Q?Zdroje=2C_kter=E9_byste_nem=ECli_minout=2E?= +From: David Schovanec <scho...@schovi.cz> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-2 +Content-Transfer-Encoding: quoted-printable + + +Rozs=E1hl=E9 pojedn=E1n=ED o tom, jak se pracuje s funkcema - +http://kangax.github.com/nfe/ + +Blog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu +http://javascriptweblog.wordpress.com/ + + +From richte...@gmail.com Thu Jan 2 15:15:35 2014 +Received: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334; + Thu, 26 May 2011 01:34:27 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May + 2011 01:34:26 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.224.32.129 with SMTP id c1mr163718qad.40.1306398866188; Thu, + 26 May 2011 01:34:26 -0700 (PDT) +Received: by a26g2000vbo.googlegroups.com with HTTP; Thu, 26 May 2011 01:34:26 + -0700 (PDT) +Date: Thu, 26 May 2011 01:34:26 -0700 (PDT) +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) + AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe) +Message-ID: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com> +Subject: SproutCore +From: Josef Richter <richte...@gmail.com> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-2 +Content-Transfer-Encoding: quoted-printable + + +Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer +preview. + +Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html+c= +ss a +handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc, = +=BEe +se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly +p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak=BEe t= +y +appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn= +=FD. + +http://blog.sproutcore.com/announcing-sproutcore-2-0/ + + +From damn...@gmail.com Thu Jan 2 15:15:36 2014 +Received: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592; + Fri, 27 May 2011 04:29:49 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011 + 04:29:49 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.90.1.10 with SMTP id 10mr123075aga.25.1306495789240; Fri, 27 + May 2011 04:29:49 -0700 (PDT) +Received: by n10g2000yqf.googlegroups.com with HTTP; Fri, 27 May 2011 04:29:49 + -0700 (PDT) +Date: Fri, 27 May 2011 04:29:49 -0700 (PDT) +In-Reply-To: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com> +References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com> +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; cs-cz) + AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1,gzip(gfe) +Message-ID: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com> +Subject: Re: SproutCore +From: pepe <damn...@gmail.com> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-2 +Content-Transfer-Encoding: quoted-printable + + +Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd= +=E8it, =BEe to m=E1m +vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou. + + +On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote: +> Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer +> preview. +> +> Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC html= ++css a +> handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv=EDc= +, =BEe +> se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly +> p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak=BEe= + ty +> appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d= +n=FD. +> +> http://blog.sproutcore.com/announcing-sproutcore-2-0/ + + +From richte...@gmail.com Thu Jan 2 15:15:36 2014 +Received: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290; + Fri, 27 May 2011 04:56:01 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May + 2011 04:56:01 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.224.186.16 with SMTP id cq16mr523722qab.19.1306497361142; Fri, + 27 May 2011 04:56:01 -0700 (PDT) +Received: by v31g2000vbs.googlegroups.com with HTTP; Fri, 27 May 2011 04:56:01 + -0700 (PDT) +Date: Fri, 27 May 2011 04:56:01 -0700 (PDT) +In-Reply-To: <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com> +References: <7f58c175-0cc9-4fc9-850d-d4f5ec825fca@a26g2000vbo.googlegroups.com> + <71c066ac-67fa-4301-8aa2-e65bef981ad7@n10g2000yqf.googlegroups.com> +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) + AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.24,gzip(gfe) +Message-ID: <c6e41779-e622-4559-9381-9f250e90370d@v31g2000vbs.googlegroups.com> +Subject: Re: SproutCore +From: Josef Richter <richte...@gmail.com> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-2 +Content-Transfer-Encoding: quoted-printable + + +Na produk=E8n=ED v=ECc ne. Sp=ED=B9 jsem si s t=EDm jenom hr=E1l, zkou=B9el= + tutorialy +a tak. Tam byl obrovskej probl=E9m (podle m=EC) ta View vrstva, kter=E1 byl= +a +stra=B9nej mastodont a tla=E8ila t=EC do n=ECjak=FDho stylu. Proto ty appky +v=B9echny vypadaly prakticky stejn=EC jako MobileMe. + +A proto taky spoustu lid=ED nadchlo sp=ED=B9 Backbone, kter=FD na to =B9lo = +z +druh=E9 strany a naopak je ultrajednoduch=FD. Nem=E1 v sob=EC nap=F8. v=F9b= +ec +=BE=E1dn=FD templatov=E1n=ED a i ten MVC je tam trochu specifickej. + +Ten SproutCore 2.0 by kone=E8n=EC mohl b=FDt pr=F9lom v popularit=EC. Ten Y= +ehuda +nen=ED blbej a jeho ruka na tom za=E8=EDn=E1 b=FDt vid=ECt :-) + +On May 27, 1:29=A0pm, pepe <damn...@gmail.com> wrote: +> Pou=BE=EDval jsi to u=BE na n=EC=E8em? Ja se st=E1le nemu=BEu p=F8esv=ECd= +=E8it, =BEe to m=E1m +> vyzkou=B9et. A jak zn=E1mo dobr=E9 p=F8=EDklady t=E1hnou. +> +> On 26 kv=EC, 10:34, Josef Richter <richte...@gmail.com> wrote: +> +> +> +> +> +> +> +> > Tak dneska bylo ohl=E1=B9eno SproutCore 2.0 a je k dispozici developer +> > preview. +> +> > Hlavn=ED zm=ECna je ve view layer - bude se pou=BE=EDvat nom=E1ln=EC ht= +ml+css a +> > handlebars templating. To mi na p=F8edchoz=EDch verz=EDch vadilo nejv= +=EDc, =BEe +> > se celej view vlastn=EC psal z javascriptu, proto=BEe tam byly +> > p=F8eddefinovan=FD v=B9echny pages, panes, buttons, widgets, atd. tak= +=BEe ty +> > appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop= +=E1dn=FD. +> +> >http://blog.sproutcore.com/announcing-sproutcore-2-0/ + + +From richte...@gmail.com Thu Jan 2 15:15:39 2014 +Received: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553; + Wed, 25 May 2011 12:25:09 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May + 2011 12:25:08 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.224.203.8 with SMTP id fg8mr904907qab.54.1306351508426; Wed, + 25 May 2011 12:25:08 -0700 (PDT) +Received: by n10g2000vby.googlegroups.com with HTTP; Wed, 25 May 2011 12:25:08 + -0700 (PDT) +Date: Wed, 25 May 2011 12:25:08 -0700 (PDT) +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) + AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe) +Message-ID: <32a62e7b-28f6-4b99-920a-eba49518b9a4@n10g2000vby.googlegroups.com> +Subject: =?ISO-8859-1?Q?IRC_kan=E1l?= +From: Josef Richter <richte...@gmail.com> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-1 + + +#javascript.cz na irc.freenode.net + + +From richte...@gmail.com Thu Jan 2 15:15:40 2014 +Received: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198; + Wed, 25 May 2011 05:05:20 -0700 (PDT) +X-BeenThere: javascriptcz@googlegroups.com +Received: by 10.224.138.148 with SMTP id a20ls76694qau.5.gmail; Wed, 25 May + 2011 05:05:20 -0700 (PDT) +MIME-Version: 1.0 +Received: by 10.224.217.200 with SMTP id hn8mr722125qab.0.1306325120116; Wed, + 25 May 2011 05:05:20 -0700 (PDT) +Received: by 32g2000vbe.googlegroups.com with HTTP; Wed, 25 May 2011 05:05:20 + -0700 (PDT) +Date: Wed, 25 May 2011 05:05:20 -0700 (PDT) +User-Agent: G2/1.0 +X-HTTP-UserAgent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) + AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24,gzip(gfe) +Message-ID: <df869047-365d-40da-8641-8c74d1ae2090@32g2000vbe.googlegroups.com> +Subject: =?ISO-8859-2?Q?=C8lenov=E9?= +From: Josef Richter <richte...@gmail.com> +To: "javascript.cz" <javasc...@googlegroups.com> +Content-Type: text/plain; charset=ISO-8859-2 +Content-Transfer-Encoding: quoted-printable + + +bylo by fajn kdyby ka=BEd=FD =E8len napsal krati=E8k=E9 info o sob=EC + +vykop=E1v=E1m: + +jsem web designer a front-end developer, d=ECl=E1m hlavn=EC jednodu=B9=B9= +=ED v=ECci +v jQuery, ale jak se st=E1vaj=ED slo=BEit=ECj=B9=EDmi, tak se sna=BE=EDm pr= +oniknout do +Backbone.js, SproutCore, apod. Experimentuju i s jQuery Mobile a +mobiln=EDm aplikacemi v=F9bec. Slu=B9n=EC se orientuju v Ruby on Rails a ve= +du +mal=FD developersk=FD team, zam=EC=F8en=FD na agiln=ED development webov=FD= +ch +aplikac=ED. + + diff --git a/test/test_functional.py b/test/test_functional.py index 3f657f6..3e8d874 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -10,7 +10,7 @@ ORIG_URL = 'http://groups.google.com/d/forum/jbrout' EXP_URL = 'https://groups.google.com/forum/' + \ '?_escaped_fragment_=forum/jbrout' TOPIC_URL = 'https://groups.google.com/forum/#!topic/jbrout/xNwoVmC07KI' -ARTICLE_URL = 'https://groups.google.com/forum/#!msg/jbrout' + \ +ARTICLE_URL = 'https://groups.google.com/d/msg/jbrout' + \ '/xNwoVmC07KI/OfpRHFscUkwJ' @@ -34,9 +34,7 @@ class TestGGScrapperFunctional(unittest.TestCase): def test_get_raw_article(self): self.maxDiff = None - logging.debug('article = URL {}'.format(ARTICLE_URL)) article = gg_scrapper.Article(ARTICLE_URL) - logging.debug('article = raw URL {}'.format(article.root)) rfc_msg = article.collect_message().replace('\r\n', '\n') rfc_msg = '\n'.join(rfc_msg.split('\n')[1:]) diff --git a/test/test_unit.py b/test/test_unit.py index e79c635..56ec08f 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -1,6 +1,9 @@ -import pickle +import os +import tempfile +import yaml import unittest import gg_scrapper +from gg_scrapper import Group, Topic, Article # noqa IN_URL = 'https://groups.google.com/forum/#!forum/jbrout' ORIG_URL = 'http://groups.google.com/d/forum/jbrout' @@ -22,15 +25,19 @@ class TestMBOX(unittest.TestCase): def test_create_mbox(self): '''Create a mbox file from (pickled) Group ''' - group_file_name = 'test/group.pickle' - with open(group_file_name, 'r', encoding='utf8') as group_f: - group = pickle.load(group_f) + group_file_name = 'test/group.yaml' + with open(group_file_name, 'r') as group_f: + group = yaml.load(group_f) - mbx = gg_scrapper.MBOX() - mbx.format_mbox(group) + mbx_file = tempfile.NamedTemporaryFile('w', delete=False) + mbx = gg_scrapper.MBOX(mbx_file.name) + mbx.write_group(group) - with open('test/generated_mbox.mbx') as exp_f: - self.assertEqual(exp_f.read(), mbx.mbox_string) + with open('test/mbox.mbx') as exp_f: + with open(mbx_file.name) as mbx_f: + self.assertEqual(exp_f.read(), mbx_f.read()) + + os.unlink(mbx_file.name) if __name__ == '__main__': unittest.main() |