From 80f10eaaf784db7c0dfa04ed8917f98aac60eff0 Mon Sep 17 00:00:00 2001 From: Matěj Cepl Date: Thu, 2 Jan 2014 16:48:57 +0100 Subject: For each group generate also a list of all mangled addresses. Google Groups (rightly) protects addresses against spammers. There is (obviously) no way to find the true value of these addresses programmatically, so we just generate a list of all affected ones, which could later be completed with the true values (collected somehow) and fixed by some other script. Fixes #275 --- gg_scrapper.py | 27 +++++++++++++++++++++++++++ test/group.yaml | 12 ++++++------ test/mangled_address.cnf | 6 ++++++ test/mbox.mbx | 12 ++++++------ test/test_unit.py | 23 ++++++++++++++++++----- 5 files changed, 63 insertions(+), 17 deletions(-) create mode 100644 test/mangled_address.cnf diff --git a/gg_scrapper.py b/gg_scrapper.py index f1cde21..05eecca 100755 --- a/gg_scrapper.py +++ b/gg_scrapper.py @@ -1,5 +1,6 @@ #!/usr/bin/python3 +from configparser import ConfigParser import mailbox import os.path import re @@ -189,6 +190,29 @@ class Group(Page): for art in top.articles: yield art.raw_message + def collect_mangled_addrs(self): + addrs = set() + addr_sec_label = 'addresses' + for top in self.topics: + for art in top.articles: + msg_str = art.raw_message + # see http://stackoverflow.com/questions/201323 + msg_matches = re.findall( + r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', + msg_str, re.IGNORECASE) + if msg_matches is not None: + for mtch in msg_matches: + addrs.add(mtch) + + addrs = sorted(list(addrs)) + + with open('{}.cnf'.format(self.name), 'w') as cnf_f: + cnf_p = ConfigParser() + cnf_p.add_section(addr_sec_label) + for addr in addrs: + cnf_p.set(addr_sec_label, addr, '') + cnf_p.write(cnf_f) + class MBOX(mailbox.mbox): def __init__(self, filename): @@ -219,5 +243,8 @@ def main(group_URL): mbx = MBOX("{}.mbx".format(grp.name)) mbx.write_group(grp) + # generate list of addresses protected against spammers + grp.collect_mangled_addrs() +
if __name__ == '__main__': main(sys.argv[1]) diff --git a/test/group.yaml b/test/group.yaml index 44cac25..59a33d4 100644 --- a/test/group.yaml +++ b/test/group.yaml @@ -5,7 +5,7 @@ topics: - !!python/object:gg_scrapper.Topic articles: - !!python/object:gg_scrapper.Article {raw_message: "From scho...@schovi.cz Thu Jan\ - \ 2 15:15:33 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\ + \ 2 16:12:04 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\ \ Sat, 28 May 2011 08:18:46 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May\n\ \ 2011 08:18:45 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.205.130\ @@ -26,7 +26,7 @@ topics: - !!python/object:gg_scrapper.Topic articles: - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ - \ Jan 2 15:15:35 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\ + \ Jan 2 16:12:06 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\ \ Thu, 26 May 2011 01:34:27 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May\n\ \ 2011 01:34:26 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.32.129 with\ @@ -47,7 +47,7 @@ topics: \ v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=\n=FD.\n\nhttp://blog.sproutcore.com/announcing-sproutcore-2-0/\n\ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/pUoGBDjK_HcJ'} - !!python/object:gg_scrapper.Article {raw_message: "From damn...@gmail.com Thu Jan\ - \ 2 15:15:36 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\ + \ 2 16:12:07 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\ \ Fri, 27 May 2011 04:29:49 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 
10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011\n\ \ 04:29:49 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.90.1.10 with SMTP\ @@ -72,7 +72,7 @@ topics: \ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d=\nn=FD.\n>\n> http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\ \n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/Gxus9ddtp5wJ'} - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ - \ Jan 2 15:15:36 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\ + \ Jan 2 16:12:07 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\ \ Fri, 27 May 2011 04:56:01 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May\n\ \ 2011 04:56:01 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.186.16 with\ @@ -112,7 +112,7 @@ topics: - !!python/object:gg_scrapper.Topic articles: - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ - \ Jan 2 15:15:39 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\ + \ Jan 2 16:12:09 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\ \ Wed, 25 May 2011 12:25:09 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May\n\ \ 2011 12:25:08 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.203.8 with\ @@ -130,7 +130,7 @@ topics: - !!python/object:gg_scrapper.Topic articles: - !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\ - \ Jan 2 15:15:40 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\ + \ Jan 2 16:12:10 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\ \ Wed, 25 May 2011 05:05:20 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\ Received: by 10.224.138.148 
with SMTP id a20ls76694qau.5.gmail; Wed, 25 May\n\ \ 2011 05:05:20 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.217.200\ diff --git a/test/mangled_address.cnf b/test/mangled_address.cnf new file mode 100644 index 0000000..52700d2 --- /dev/null +++ b/test/mangled_address.cnf @@ -0,0 +1,6 @@ +[addresses] +damn...@gmail.com = +javasc...@googlegroups.com = +richte...@gmail.com = +scho...@schovi.cz = + diff --git a/test/mbox.mbx b/test/mbox.mbx index 026dc5a..1d67988 100644 --- a/test/mbox.mbx +++ b/test/mbox.mbx @@ -1,4 +1,4 @@ -From scho...@schovi.cz Thu Jan 2 15:15:33 2014 +From scho...@schovi.cz Thu Jan 2 16:12:04 2014 Received: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917; Sat, 28 May 2011 08:18:46 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com @@ -28,7 +28,7 @@ Blog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu http://javascriptweblog.wordpress.com/ -From richte...@gmail.com Thu Jan 2 15:15:35 2014 +From richte...@gmail.com Thu Jan 2 16:12:06 2014 Received: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334; Thu, 26 May 2011 01:34:27 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com @@ -67,7 +67,7 @@ appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn= http://blog.sproutcore.com/announcing-sproutcore-2-0/ -From damn...@gmail.com Thu Jan 2 15:15:36 2014 +From damn...@gmail.com Thu Jan 2 16:12:07 2014 Received: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592; Fri, 27 May 2011 04:29:49 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com @@ -114,7 +114,7 @@ n=FD. 
> http://blog.sproutcore.com/announcing-sproutcore-2-0/ -From richte...@gmail.com Thu Jan 2 15:15:36 2014 +From richte...@gmail.com Thu Jan 2 16:12:07 2014 Received: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290; Fri, 27 May 2011 04:56:01 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com @@ -186,7 +186,7 @@ ml+css a > >http://blog.sproutcore.com/announcing-sproutcore-2-0/ -From richte...@gmail.com Thu Jan 2 15:15:39 2014 +From richte...@gmail.com Thu Jan 2 16:12:09 2014 Received: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553; Wed, 25 May 2011 12:25:09 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com @@ -211,7 +211,7 @@ Content-Type: text/plain; charset=ISO-8859-1 #javascript.cz na irc.freenode.net -From richte...@gmail.com Thu Jan 2 15:15:40 2014 +From richte...@gmail.com Thu Jan 2 16:12:10 2014 Received: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198; Wed, 25 May 2011 05:05:20 -0700 (PDT) X-BeenThere: javascriptcz@googlegroups.com diff --git a/test/test_unit.py b/test/test_unit.py index 56ec08f..9cb04fc 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -22,16 +22,17 @@ class TestGGScrapper(unittest.TestCase): class TestMBOX(unittest.TestCase): - def test_create_mbox(self): - '''Create a mbox file from (pickled) Group - ''' + def setUp(self): group_file_name = 'test/group.yaml' with open(group_file_name, 'r') as group_f: - group = yaml.load(group_f) + self.group = yaml.load(group_f) + def test_create_mbox(self): + '''Create a mbox file from (YAMLed) Group + ''' mbx_file = tempfile.NamedTemporaryFile('w', delete=False) mbx = gg_scrapper.MBOX(mbx_file.name) - mbx.write_group(group) + mbx.write_group(self.group) with open('test/mbox.mbx') as exp_f: with open(mbx_file.name) as mbx_f: @@ -39,5 +40,17 @@ class TestMBOX(unittest.TestCase): os.unlink(mbx_file.name) + def test_generate_list_mangled_addrs(self): + self.maxDiff = None + with open('test/mangled_address.cnf') as exp_addr_f: + exp_str = 
exp_addr_f.read() + + self.group.collect_mangled_addrs() + + with open('{}.cnf'.format(self.group.name)) as obs_f: + mang_addres = obs_f.read() + self.assertEqual(exp_str, mang_addres) + + if __name__ == '__main__': unittest.main() -- cgit