aboutsummaryrefslogtreecommitdiffstats
path: root/gg_scrapper.py
diff options
context:
space:
mode:
Diffstat (limited to 'gg_scrapper.py')
-rwxr-xr-xgg_scrapper.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index f1cde21..05eecca 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,5 +1,6 @@
#!/usr/bin/python3
+from configparser import ConfigParser
import mailbox
import os.path
import re
@@ -189,6 +190,29 @@ class Group(Page):
for art in top.articles:
yield art.raw_message
+ def collect_mangled_addrs(self):
+ addrs = set()
+ addr_sec_label = 'addresses'
+ for top in self.topics:
+ for art in top.articles:
+ msg_str = art.raw_message
+ # see http://stackoverflow.com/questions/201323
+ msg_matches = re.findall(
+ r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
+ msg_str, re.IGNORECASE)
+ if msg_matches is not None:
+ for mtch in msg_matches:
+ addrs.add(mtch)
+
+ addrs = sorted(list(addrs))
+
+ with open('{}.cnf'.format(self.name), 'w') as cnf_f:
+ cnf_p = ConfigParser()
+ cnf_p.add_section(addr_sec_label)
+ for addr in addrs:
+ cnf_p.set(addr_sec_label, addr, '')
+ cnf_p.write(cnf_f)
+
class MBOX(mailbox.mbox):
def __init__(self, filename):
@@ -219,5 +243,8 @@ def main(group_URL):
mbx = MBOX("{}.mbx".format(grp.name))
mbx.write_group(grp)
+ # generate list of addresses protected against spammers
+ grp.collect_mangled_addrs()
+
if __name__ == '__main__':
main(sys.argv[1])