about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Matěj Cepl <mcepl@redhat.com> 2014-01-08 00:00:17 +0100
committer Matěj Cepl <mcepl@redhat.com> 2014-01-11 10:50:35 +0100
commit 0009e30e4ad61f40ca56c58ecbdfbdc73809beee (patch)
tree 8282f7db673b62e3dd891947c078edfaa7c8a992
parent c5c5b68dd2ed591e0ad411bf3dde4611cb39f18c (diff)
download gg_scraper-0009e30e4ad61f40ca56c58ecbdfbdc73809beee.tar.gz
Sort unmangled addresses in the configuration file by frequency.
Fixes #287
-rwxr-xr-x gg_scraper.py 15
-rw-r--r-- test/mangled_address.cnf 4
-rw-r--r-- test/unmangled_address.cnf 5
3 files changed, 15 insertions, 9 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 209b92c..0628dd3 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -20,6 +20,8 @@ with this program. If not, see <http://www.gnu.org/licenses/>.'
from __future__ import absolute_import, print_function, unicode_literals
import argparse
+from collections import OrderedDict
+import operator
try:
from configparser import ConfigParser
except ImportError:
@@ -237,7 +239,7 @@ class Group(Page):
yield art.raw_message
def collect_mangled_addrs(self):
- addrs = set()
+ addrs = {}
for top in self.topics:
for art in top.articles:
msg_str = art.raw_message
@@ -245,9 +247,14 @@ class Group(Page):
msg_matches = MANGLED_ADDR_RE.findall(msg_str)
if msg_matches is not None:
for mtch in msg_matches:
- addrs.add(mtch)
-
- addrs = sorted(list(addrs))
+ if mtch in addrs:
+ addrs[mtch] += 1
+ else:
+ addrs[mtch] = 1
+
+ addrs = OrderedDict(sorted(addrs.items(),
+ key=operator.itemgetter(1),
+ reverse=True))
with open('{}.cnf'.format(self.name), 'w') as cnf_f:
cnf_p = ConfigParser()
diff --git a/test/mangled_address.cnf b/test/mangled_address.cnf
index 52700d2..84994f6 100644
--- a/test/mangled_address.cnf
+++ b/test/mangled_address.cnf
@@ -1,6 +1,6 @@
[addresses]
-damn...@gmail.com =
-javasc...@googlegroups.com =
richte...@gmail.com =
+javasc...@googlegroups.com =
+damn...@gmail.com =
scho...@schovi.cz =
diff --git a/test/unmangled_address.cnf b/test/unmangled_address.cnf
index 9d01b3e..b38f6c5 100644
--- a/test/unmangled_address.cnf
+++ b/test/unmangled_address.cnf
@@ -1,6 +1,5 @@
[addresses]
-damn...@gmail.com = damn.pepe@gmail.com
-javasc...@googlegroups.com = javascriptcz@googlegroups.com
richte...@gmail.com = richter.josef@gmail.com
+javasc...@googlegroups.com = javascriptcz@googlegroups.com
+damn...@gmail.com = damn.pepe@gmail.com
scho...@schovi.cz = schovanec@schovi.cz
-