about | summary | refs | log | tree | commit | diff | stats
path: root/gg_scraper.py
diff options
context:
space:
mode:
author Matěj Cepl <mcepl@redhat.com> 2014-01-08 00:00:17 +0100
committer Matěj Cepl <mcepl@redhat.com> 2014-01-11 10:50:35 +0100
commit 0009e30e4ad61f40ca56c58ecbdfbdc73809beee (patch)
tree 8282f7db673b62e3dd891947c078edfaa7c8a992 /gg_scraper.py
parent c5c5b68dd2ed591e0ad411bf3dde4611cb39f18c (diff)
download gg_scraper-0009e30e4ad61f40ca56c58ecbdfbdc73809beee.tar.gz
Sort unmangled addresses in the configuration file by frequency.
Fixes #287
Diffstat (limited to 'gg_scraper.py')
-rwxr-xr-x gg_scraper.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 209b92c..0628dd3 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -20,6 +20,8 @@ with this program. If not, see <http://www.gnu.org/licenses/>.'
from __future__ import absolute_import, print_function, unicode_literals
import argparse
+from collections import OrderedDict
+import operator
try:
from configparser import ConfigParser
except ImportError:
@@ -237,7 +239,7 @@ class Group(Page):
yield art.raw_message
def collect_mangled_addrs(self):
- addrs = set()
+ addrs = {}
for top in self.topics:
for art in top.articles:
msg_str = art.raw_message
@@ -245,9 +247,14 @@ class Group(Page):
msg_matches = MANGLED_ADDR_RE.findall(msg_str)
if msg_matches is not None:
for mtch in msg_matches:
- addrs.add(mtch)
-
- addrs = sorted(list(addrs))
+ if mtch in addrs:
+ addrs[mtch] += 1
+ else:
+ addrs[mtch] = 1
+
+ addrs = OrderedDict(sorted(addrs.items(),
+ key=operator.itemgetter(1),
+ reverse=True))
with open('{}.cnf'.format(self.name), 'w') as cnf_f:
cnf_p = ConfigParser()