aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@redhat.com>2014-01-02 16:48:57 +0100
committerMatěj Cepl <mcepl@redhat.com>2014-01-02 16:48:57 +0100
commit80f10eaaf784db7c0dfa04ed8917f98aac60eff0 (patch)
tree06c4351c0b53bb795925ca9db5d2fa3673663582
parent4b3df4fabb92dab4028cb794de897eaf3b8fde82 (diff)
downloadgg_scraper-80f10eaaf784db7c0dfa04ed8917f98aac60eff0.tar.gz
For each group, also generate a list of all mangled addresses.
Google Groups (rightly) protects addresses against spammers. There is (obviously) no way to find the true values of these addresses programmatically, so we just generate a list of all affected ones, which can later be completed with the true values (collected somehow) and fixed by some other script. Fixes #275
-rwxr-xr-xgg_scrapper.py27
-rw-r--r--test/group.yaml12
-rw-r--r--test/mangled_address.cnf6
-rw-r--r--test/mbox.mbx12
-rw-r--r--test/test_unit.py23
5 files changed, 63 insertions, 17 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index f1cde21..05eecca 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,5 +1,6 @@
#!/usr/bin/python3
+from configparser import ConfigParser
import mailbox
import os.path
import re
@@ -189,6 +190,29 @@ class Group(Page):
for art in top.articles:
yield art.raw_message
def collect_mangled_addrs(self):
    """Write all mangled addresses found in the group to ``<name>.cnf``.

    Scans ``raw_message`` of every article in every topic of
    ``self.topics`` for addresses obfuscated with an ellipsis before the
    ``@`` sign (e.g. ``richte...@gmail.com``).  The unique matches are
    written, sorted, as value-less options of an ``[addresses]`` section
    into the file ``'{}.cnf'.format(self.name)`` in the current working
    directory, so that the real addresses can be filled in later.

    The method returns nothing; its only result is the written file.
    """
    addr_sec_label = 'addresses'
    # see http://stackoverflow.com/questions/201323
    mangled_re = re.compile(
        r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
        re.IGNORECASE)

    addrs = set()
    for top in self.topics:
        for art in top.articles:
            # findall() always returns a list (possibly empty), never
            # None, so the matches can be merged in unconditionally.
            addrs.update(mangled_re.findall(art.raw_message))

    cnf_p = ConfigParser()
    # ConfigParser lower-cases option names by default; keep the
    # addresses exactly as they appear in the messages.
    cnf_p.optionxform = str
    cnf_p.add_section(addr_sec_label)
    for addr in sorted(addrs):
        cnf_p.set(addr_sec_label, addr, '')

    with open('{}.cnf'.format(self.name), 'w') as cnf_f:
        cnf_p.write(cnf_f)
+
class MBOX(mailbox.mbox):
def __init__(self, filename):
@@ -219,5 +243,8 @@ def main(group_URL):
mbx = MBOX("{}.mbx".format(grp.name))
mbx.write_group(grp)
+ # generate list of addresses protected against spammers
+ grp.collect_mangled_addrs()
+
if __name__ == '__main__':
main(sys.argv[1])
diff --git a/test/group.yaml b/test/group.yaml
index 44cac25..59a33d4 100644
--- a/test/group.yaml
+++ b/test/group.yaml
@@ -5,7 +5,7 @@ topics:
- !!python/object:gg_scrapper.Topic
articles:
- !!python/object:gg_scrapper.Article {raw_message: "From scho...@schovi.cz Thu Jan\
- \ 2 15:15:33 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\
+ \ 2 16:12:04 2014\nReceived: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;\n\
\ Sat, 28 May 2011 08:18:46 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.224.187.145 with SMTP id cw17ls698645qab.2.gmail; Sat, 28 May\n\
\ 2011 08:18:45 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.205.130\
@@ -26,7 +26,7 @@ topics:
- !!python/object:gg_scrapper.Topic
articles:
- !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
- \ Jan 2 15:15:35 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\
+ \ Jan 2 16:12:06 2014\nReceived: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;\n\
\ Thu, 26 May 2011 01:34:27 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.224.187.145 with SMTP id cw17ls261575qab.2.gmail; Thu, 26 May\n\
\ 2011 01:34:26 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.32.129 with\
@@ -47,7 +47,7 @@ topics:
\ v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=\n=FD.\n\nhttp://blog.sproutcore.com/announcing-sproutcore-2-0/\n\
\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/pUoGBDjK_HcJ'}
- !!python/object:gg_scrapper.Article {raw_message: "From damn...@gmail.com Thu Jan\
- \ 2 15:15:36 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\
+ \ 2 16:12:07 2014\nReceived: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;\n\
\ Fri, 27 May 2011 04:29:49 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.90.58.22 with SMTP id g22ls443766aga.2.gmail; Fri, 27 May 2011\n\
\ 04:29:49 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.90.1.10 with SMTP\
@@ -72,7 +72,7 @@ topics:
\ stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1d=\nn=FD.\n>\n> http://blog.sproutcore.com/announcing-sproutcore-2-0/\n\
\n", root: 'https://groups.google.com/forum/message/raw?msg=javascriptcz/-4cy0XTGGaU/Gxus9ddtp5wJ'}
- !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
- \ Jan 2 15:15:36 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\
+ \ Jan 2 16:12:07 2014\nReceived: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;\n\
\ Fri, 27 May 2011 04:56:01 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.224.175.74 with SMTP id w10ls508844qaz.0.gmail; Fri, 27 May\n\
\ 2011 04:56:01 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.186.16 with\
@@ -112,7 +112,7 @@ topics:
- !!python/object:gg_scrapper.Topic
articles:
- !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
- \ Jan 2 15:15:39 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\
+ \ Jan 2 16:12:09 2014\nReceived: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;\n\
\ Wed, 25 May 2011 12:25:09 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.224.181.131 with SMTP id by3ls169717qab.3.gmail; Wed, 25 May\n\
\ 2011 12:25:08 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.203.8 with\
@@ -130,7 +130,7 @@ topics:
- !!python/object:gg_scrapper.Topic
articles:
- !!python/object:gg_scrapper.Article {raw_message: "From richte...@gmail.com Thu\
- \ Jan 2 15:15:40 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\
+ \ Jan 2 16:12:10 2014\nReceived: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;\n\
\ Wed, 25 May 2011 05:05:20 -0700 (PDT)\nX-BeenThere: javascriptcz@googlegroups.com\n\
Received: by 10.224.138.148 with SMTP id a20ls76694qau.5.gmail; Wed, 25 May\n\
\ 2011 05:05:20 -0700 (PDT)\nMIME-Version: 1.0\nReceived: by 10.224.217.200\
diff --git a/test/mangled_address.cnf b/test/mangled_address.cnf
new file mode 100644
index 0000000..52700d2
--- /dev/null
+++ b/test/mangled_address.cnf
@@ -0,0 +1,6 @@
+[addresses]
+damn...@gmail.com =
+javasc...@googlegroups.com =
+richte...@gmail.com =
+scho...@schovi.cz =
+
diff --git a/test/mbox.mbx b/test/mbox.mbx
index 026dc5a..1d67988 100644
--- a/test/mbox.mbx
+++ b/test/mbox.mbx
@@ -1,4 +1,4 @@
-From scho...@schovi.cz Thu Jan 2 15:15:33 2014
+From scho...@schovi.cz Thu Jan 2 16:12:04 2014
Received: by 10.224.192.193 with SMTP id dr1mr1092656qab.9.1306595926917;
Sat, 28 May 2011 08:18:46 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
@@ -28,7 +28,7 @@ Blog o v=B9em mo=BEn=E9m a nemo=BEn=E9m kolem javascriptu
http://javascriptweblog.wordpress.com/
-From richte...@gmail.com Thu Jan 2 15:15:35 2014
+From richte...@gmail.com Thu Jan 2 16:12:06 2014
Received: by 10.224.136.200 with SMTP id s8mr198006qat.21.1306398867334;
Thu, 26 May 2011 01:34:27 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
@@ -67,7 +67,7 @@ appky pak vypadaly v=B9echny stejn=EC (hnusn=EC) a bylo to t=EC=BEkop=E1dn=
http://blog.sproutcore.com/announcing-sproutcore-2-0/
-From damn...@gmail.com Thu Jan 2 15:15:36 2014
+From damn...@gmail.com Thu Jan 2 16:12:07 2014
Received: by 10.100.168.2 with SMTP id q2mr1217945ane.14.1306495789592;
Fri, 27 May 2011 04:29:49 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
@@ -114,7 +114,7 @@ n=FD.
> http://blog.sproutcore.com/announcing-sproutcore-2-0/
-From richte...@gmail.com Thu Jan 2 15:15:36 2014
+From richte...@gmail.com Thu Jan 2 16:12:07 2014
Received: by 10.224.9.144 with SMTP id l16mr693260qal.26.1306497361290;
Fri, 27 May 2011 04:56:01 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
@@ -186,7 +186,7 @@ ml+css a
> >http://blog.sproutcore.com/announcing-sproutcore-2-0/
-From richte...@gmail.com Thu Jan 2 15:15:39 2014
+From richte...@gmail.com Thu Jan 2 16:12:09 2014
Received: by 10.224.126.72 with SMTP id b8mr1923833qas.13.1306351509553;
Wed, 25 May 2011 12:25:09 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
@@ -211,7 +211,7 @@ Content-Type: text/plain; charset=ISO-8859-1
#javascript.cz na irc.freenode.net
-From richte...@gmail.com Thu Jan 2 15:15:40 2014
+From richte...@gmail.com Thu Jan 2 16:12:10 2014
Received: by 10.224.215.3 with SMTP id hc3mr1823425qab.4.1306325120198;
Wed, 25 May 2011 05:05:20 -0700 (PDT)
X-BeenThere: javascriptcz@googlegroups.com
diff --git a/test/test_unit.py b/test/test_unit.py
index 56ec08f..9cb04fc 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -22,16 +22,17 @@ class TestGGScrapper(unittest.TestCase):
class TestMBOX(unittest.TestCase):
- def test_create_mbox(self):
- '''Create a mbox file from (pickled) Group
- '''
+ def setUp(self):
group_file_name = 'test/group.yaml'
with open(group_file_name, 'r') as group_f:
- group = yaml.load(group_f)
+ self.group = yaml.load(group_f)
+ def test_create_mbox(self):
+ '''Create a mbox file from (YAMLed) Group
+ '''
mbx_file = tempfile.NamedTemporaryFile('w', delete=False)
mbx = gg_scrapper.MBOX(mbx_file.name)
- mbx.write_group(group)
+ mbx.write_group(self.group)
with open('test/mbox.mbx') as exp_f:
with open(mbx_file.name) as mbx_f:
@@ -39,5 +40,17 @@ class TestMBOX(unittest.TestCase):
os.unlink(mbx_file.name)
+ def test_generate_list_mangled_addrs(self):
+ self.maxDiff = None
+ with open('test/mangled_address.cnf') as exp_addr_f:
+ exp_str = exp_addr_f.read()
+
+ self.group.collect_mangled_addrs()
+
+ with open('{}.cnf'.format(self.group.name)) as obs_f:
+ mang_addres = obs_f.read()
+ self.assertEqual(exp_str, mang_addres)
+
+
if __name__ == '__main__':
unittest.main()