diff options
Diffstat (limited to 'interfaces/xml/be-mbox-to-xml')
-rw-r--r-- | interfaces/xml/be-mbox-to-xml | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/interfaces/xml/be-mbox-to-xml b/interfaces/xml/be-mbox-to-xml index 335f92f..a740117 100644 --- a/interfaces/xml/be-mbox-to-xml +++ b/interfaces/xml/be-mbox-to-xml @@ -25,11 +25,10 @@ followed by a blank line. import base64 import email.utils from libbe.encoding import get_encoding, set_IO_stream_encodings +from libbe.utility import time_to_str from mailbox import mbox, Message # the mailbox people really want an on-disk copy -from time import asctime, gmtime +from time import asctime, gmtime, mktime import types -from xml.sax import make_parser -from xml.sax.handler import ContentHandler from xml.sax.saxutils import escape DEFAULT_ENCODING = get_encoding() @@ -37,14 +36,34 @@ set_IO_stream_encodings(DEFAULT_ENCODING) KNOWN_IDS = [] +def normalize_email_address(address): + """ + Standardize whitespace, etc. + """ + return email.utils.formataddr(email.utils.parseaddr(address)) + +def normalize_RFC_2822_date(date): + """ + Some email clients write non-RFC 2822-compliant date tags like: + Fri, 18 Sep 2009 08:49:02 -0400 (EDT) + with the non-standard (EDT) timezone name. This function attempts + to deal with such inconsistencies. 
+ """ + time_tuple = email.utils.parsedate(date) + assert time_tuple != None, \ + 'unparsable date: "%s"' % date + return time_to_str(mktime(time_tuple)) + def comment_message_to_xml(message, fields=None): if fields == None: fields = {} new_fields = {} new_fields[u'alt-id'] = message[u'message-id'] new_fields[u'in-reply-to'] = message[u'in-reply-to'] - new_fields[u'from'] = message[u'from'] + new_fields[u'author'] = normalize_email_address(message[u'from']) new_fields[u'date'] = message[u'date'] + if new_fields[u'date'] != None: + new_fields[u'date'] = normalize_RFC_2822_date(new_fields[u'date']) new_fields[u'content-type'] = message.get_content_type() for k,v in new_fields.items(): if v != None and type(v) != types.UnicodeType: @@ -67,25 +86,27 @@ def comment_message_to_xml(message, fields=None): fields[u'in-reply-to'] = refs[0] # default to the first else: # check for multiple in-reply-to references. refs = fields[u'in-reply-to'].split() + found_ref = False for ref in refs: # search for a known reference id. if ref in KNOWN_IDS: fields[u'in-reply-to'] = ref + found_ref = True break - if fields[u'in-reply-to'] == None and len(refs) > 0: + if found_ref == False and len(refs) > 0: fields[u'in-reply-to'] = refs[0] # default to the first - if fields['alt-id'] != None: - KNOWN_IDS.append(fields['alt-id']) + if fields[u'alt-id'] != None: + KNOWN_IDS.append(fields[u'alt-id']) if message.is_multipart(): ret = [] alt_id = fields[u'alt-id'] - from_str = fields[u'from'] + from_str = fields[u'author'] date = fields[u'date'] for m in message.walk(): if m == message: continue - fields[u'from'] = from_str + fields[u'author'] = from_str fields[u'date'] = date if len(ret) > 0: # we've added one part already fields.pop(u'alt-id') # don't pass alt-id to other parts |