aboutsummaryrefslogtreecommitdiffstats
path: root/gg_scrapper.py
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@redhat.com>2014-01-02 16:48:57 +0100
committerMatěj Cepl <mcepl@redhat.com>2014-01-02 16:48:57 +0100
commit80f10eaaf784db7c0dfa04ed8917f98aac60eff0 (patch)
tree06c4351c0b53bb795925ca9db5d2fa3673663582 /gg_scrapper.py
parent4b3df4fabb92dab4028cb794de897eaf3b8fde82 (diff)
downloadgg_scraper-80f10eaaf784db7c0dfa04ed8917f98aac60eff0.tar.gz
For each group, also generate a list of all mangled addresses.
Google Groups (rightly) protects addresses against spammers. There is (obviously) no way to recover the true values of these addresses programmatically, so we just generate a list of all affected ones, which can later be completed with the true values (collected somehow) and fixed by some other script. Fixes #275
Diffstat (limited to 'gg_scrapper.py')
-rwxr-xr-xgg_scrapper.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index f1cde21..05eecca 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,5 +1,6 @@
#!/usr/bin/python3
+from configparser import ConfigParser
import mailbox
import os.path
import re
@@ -189,6 +190,29 @@ class Group(Page):
for art in top.articles:
yield art.raw_message
+    def collect_mangled_addrs(self):
+        """Write every mangled e-mail address found in the group to a file.
+
+        Scans ``raw_message`` of each article in each topic of
+        ``self.topics`` for addresses whose local part ends in a literal
+        ``...`` (for example ``john...@example.com``) and stores the
+        sorted, de-duplicated result as empty-valued keys of the
+        ``[addresses]`` section in ``<self.name>.cnf``, so that the true
+        values can be filled in later.
+        """
+        addr_sec_label = 'addresses'
+        # see http://stackoverflow.com/questions/201323
+        mangled_re = re.compile(
+            r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
+            re.IGNORECASE)
+
+        addrs = set()
+        for top in self.topics:
+            for art in top.articles:
+                # findall() always returns a list (possibly empty), never
+                # None, so the matches can be merged without any guard.
+                addrs.update(mangled_re.findall(art.raw_message))
+
+        # Build the whole config first, so that the output file is only
+        # opened (and truncated) once there is valid content to write.
+        cnf_p = ConfigParser()
+        cnf_p.add_section(addr_sec_label)
+        for addr in sorted(addrs):
+            # NOTE: ConfigParser lower-cases option names by default.
+            cnf_p.set(addr_sec_label, addr, '')
+
+        with open('{}.cnf'.format(self.name), 'w') as cnf_f:
+            cnf_p.write(cnf_f)
+
class MBOX(mailbox.mbox):
def __init__(self, filename):
@@ -219,5 +243,8 @@ def main(group_URL):
mbx = MBOX("{}.mbx".format(grp.name))
mbx.write_group(grp)
+ # generate list of addresses protected against spammers
+ grp.collect_mangled_addrs()
+
if __name__ == '__main__':
main(sys.argv[1])