diff options
author | Matěj Cepl <mcepl@redhat.com> | 2014-01-08 00:00:17 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@redhat.com> | 2014-01-11 10:50:35 +0100 |
commit | 0009e30e4ad61f40ca56c58ecbdfbdc73809beee (patch) | |
tree | 8282f7db673b62e3dd891947c078edfaa7c8a992 | |
parent | c5c5b68dd2ed591e0ad411bf3dde4611cb39f18c (diff) | |
download | gg_scraper-0009e30e4ad61f40ca56c58ecbdfbdc73809beee.tar.gz |
Sort unmangled addresses in the configuration file by frequency.
Fixes #287
-rwxr-xr-x | gg_scraper.py | 15 | ||||
-rw-r--r-- | test/mangled_address.cnf | 4 | ||||
-rw-r--r-- | test/unmangled_address.cnf | 5 |
3 files changed, 15 insertions, 9 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 209b92c..0628dd3 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -20,6 +20,8 @@ with this program. If not, see <http://www.gnu.org/licenses/>.'
 from __future__ import absolute_import, print_function, unicode_literals
 
 import argparse
+from collections import OrderedDict
+import operator
 try:
     from configparser import ConfigParser
 except ImportError:
@@ -237,7 +239,7 @@ class Group(Page):
                 yield art.raw_message
 
     def collect_mangled_addrs(self):
-        addrs = set()
+        addrs = {}
         for top in self.topics:
             for art in top.articles:
                 msg_str = art.raw_message
@@ -245,9 +247,14 @@ class Group(Page):
                 msg_matches = MANGLED_ADDR_RE.findall(msg_str)
                 if msg_matches is not None:
                     for mtch in msg_matches:
-                        addrs.add(mtch)
-
-        addrs = sorted(list(addrs))
+                        if mtch in addrs:
+                            addrs[mtch] += 1
+                        else:
+                            addrs[mtch] = 1
+
+        addrs = OrderedDict(sorted(addrs.items(),
+                                   key=operator.itemgetter(1),
+                                   reverse=True))
 
         with open('{}.cnf'.format(self.name), 'w') as cnf_f:
             cnf_p = ConfigParser()
diff --git a/test/mangled_address.cnf b/test/mangled_address.cnf
index 52700d2..84994f6 100644
--- a/test/mangled_address.cnf
+++ b/test/mangled_address.cnf
@@ -1,6 +1,6 @@
 [addresses]
-damn...@gmail.com =
-javasc...@googlegroups.com =
 richte...@gmail.com =
+javasc...@googlegroups.com =
+damn...@gmail.com =
 scho...@schovi.cz =
 
diff --git a/test/unmangled_address.cnf b/test/unmangled_address.cnf
index 9d01b3e..b38f6c5 100644
--- a/test/unmangled_address.cnf
+++ b/test/unmangled_address.cnf
@@ -1,6 +1,5 @@
 [addresses]
-damn...@gmail.com = damn.pepe@gmail.com
-javasc...@googlegroups.com = javascriptcz@googlegroups.com
 richte...@gmail.com = richter.josef@gmail.com
+javasc...@googlegroups.com = javascriptcz@googlegroups.com
+damn...@gmail.com = damn.pepe@gmail.com
 scho...@schovi.cz = schovanec@schovi.cz
-