aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@redhat.com>2014-03-17 21:08:14 +0100
committerMatěj Cepl <mcepl@redhat.com>2014-03-17 21:15:34 +0100
commit9f4254aad67783926ae8dc74a0b0ad2d528dd0f1 (patch)
tree67ff283eef9665f9c6db23928711beddefee26d6
parentfd0dcabe2cb54f2365abe1af0eac603bcf8901a4 (diff)
downloadgg_scraper-9f4254aad67783926ae8dc74a0b0ad2d528dd0f1.tar.gz
Fix demangling.0.8
* remove some reasons why import could crash * use case ignoring RE instead of plain .replace (cache those REs)
-rwxr-xr-xgg_scraper.py27
1 files changed, 21 insertions, 6 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 0f3c5ab..b1c3260 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -54,7 +54,7 @@ MANGLED_ADDR_RE = re.compile(
r'([a-zA-Z0-9_.+-]+(\.)+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
re.IGNORECASE)
-__version__ = '0.7'
+__version__ = '0.8'
pyver = sys.version_info
py26 = pyver[:2] < (2, 7)
@@ -266,6 +266,8 @@ class Group(Page):
msg_matches = MANGLED_ADDR_RE.findall(msg_str)
if msg_matches is not None:
for mtch in msg_matches:
+ if isinstance(mtch, tuple):
+ mtch = mtch[0]
if mtch in addrs:
addrs[mtch] += 1
else:
@@ -279,7 +281,12 @@ class Group(Page):
cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.add_section(ADDR_SEC_LABEL)
for addr in addrs:
- cnf_p.set(ADDR_SEC_LABEL, addr, '')
+ try:
+ cnf_p.set(ADDR_SEC_LABEL, addr, '')
+ except TypeError:
+ # Failed with addr = ('richte...@gmail.com', '.')
+ logging.info('Failed with addr = {0}'.format(addr))
+ raise
cnf_p.write(cnf_f)
@@ -334,7 +341,12 @@ def main(group_URL):
def demangle(correct_list, orig_mbx, out_mbx):
cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.read(correct_list)
- pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
+ #pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
+ pairs = dict((k, {'repl': v, 'RE': re.compile(r'\b%s\b' % k,
+ re.IGNORECASE)})
+ for (k, v) in cnf_p.items(ADDR_SEC_LABEL)
+ if v is not None)
+ counter = 0
if os.path.exists(out_mbx):
shutil.move(out_mbx, '{0}.bak'.format(out_mbx))
@@ -349,9 +361,11 @@ def demangle(correct_list, orig_mbx, out_mbx):
if matches is not None:
u_from = msg.get_from()
for orig, fixed in pairs.items():
- if (orig is None) or (fixed is None):
- continue
- msg_str = msg_str.replace(orig, fixed)
+ #if (orig is None) or (fixed is None):
+ # continue
+ #msg_str = msg_str.replace(orig, fixed)
+ msg_str = fixed['RE'].sub(fixed['repl'], msg_str)
+ counter += 1 # This is wrong
out_msg = mailbox.mboxMessage(msg_str)
out_msg.set_from(u_from)
@@ -360,6 +374,7 @@ def demangle(correct_list, orig_mbx, out_mbx):
out_mbx.add(msg)
out_mbx.close()
in_mbx.close()
+ logging.info('Change counter = {0}'.format(counter))
if __name__ == '__main__':