1 files changed, 30 insertions, 9 deletions
diff --git a/interfaces/xml/be-mbox-to-xml b/interfaces/xml/be-mbox-to-xml
index 335f92f..a740117 100755
--- a/interfaces/xml/be-mbox-to-xml
+++ b/interfaces/xml/be-mbox-to-xml
@@ -25,11 +25,10 @@ followed by a blank line.
 import base64
 import email.utils
 from libbe.encoding import get_encoding, set_IO_stream_encodings
+from libbe.utility import time_to_str
 from mailbox import mbox, Message  # the mailbox people really want an on-disk copy
-from time import asctime, gmtime
+from time import asctime, gmtime, mktime
 import types
-from xml.sax import make_parser
-from xml.sax.handler import ContentHandler
 from xml.sax.saxutils import escape
 
 DEFAULT_ENCODING = get_encoding()
@@ -37,14 +36,34 @@ set_IO_stream_encodings(DEFAULT_ENCODING)
 
 KNOWN_IDS = []
 
+def normalize_email_address(address):
+    """Round-trip address through email.utils to standardize whitespace, etc."""
+    realname, mailbox = email.utils.parseaddr(address)
+    canonical = email.utils.formataddr((realname, mailbox))
+    return canonical
+
+def normalize_RFC_2822_date(date):
+    """
+    Some email clients write non-RFC 2822-compliant date tags like:
+      Fri, 18 Sep 2009 08:49:02 -0400 (EDT)
+    with the non-standard (EDT) timezone name.  Parse such a date,
+    honoring its UTC offset, and return it formatted by time_to_str().
+    """
+    # parsedate_tz() keeps the UTC offset that parsedate() discards.
+    time_tuple = email.utils.parsedate_tz(date)
+    assert time_tuple is not None, 'unparsable date: "%s"' % date
+    return time_to_str(email.utils.mktime_tz(time_tuple))
+
 def comment_message_to_xml(message, fields=None):
     if fields == None:
         fields = {}
     new_fields = {}
     new_fields[u'alt-id'] = message[u'message-id']
     new_fields[u'in-reply-to'] = message[u'in-reply-to']
-    new_fields[u'from'] = message[u'from']
+    new_fields[u'author'] = normalize_email_address(message[u'from'])
     new_fields[u'date'] = message[u'date']
+    if new_fields[u'date'] != None:
+        new_fields[u'date'] = normalize_RFC_2822_date(new_fields[u'date'])
     new_fields[u'content-type'] = message.get_content_type()
     for k,v in new_fields.items():
         if v != None and type(v) != types.UnicodeType:
@@ -67,25 +86,27 @@ def comment_message_to_xml(message, fields=None):
                 fields[u'in-reply-to'] = refs[0] # default to the first
     else: # check for mutliple in-reply-to references.
         refs = fields[u'in-reply-to'].split()
+        found_ref = False
         for ref in refs: # search for a known reference id.
             if ref in KNOWN_IDS:
                 fields[u'in-reply-to'] = ref
+                found_ref = True
                 break
-        if fields[u'in-reply-to'] == None and len(refs) > 0:
+        if found_ref == False and len(refs) > 0:
             fields[u'in-reply-to'] = refs[0] # default to the first
 
-    if fields['alt-id'] != None:
-        KNOWN_IDS.append(fields['alt-id'])
+    if fields[u'alt-id'] != None:
+        KNOWN_IDS.append(fields[u'alt-id'])
 
     if message.is_multipart():
         ret = []
         alt_id = fields[u'alt-id']
-        from_str = fields[u'from']
+        from_str = fields[u'author']
         date = fields[u'date']
         for m in message.walk():
             if m == message:
                 continue
-            fields[u'from'] = from_str
+            fields[u'author'] = from_str
             fields[u'date'] = date
             if len(ret) > 0: # we've added one part already
                 fields.pop(u'alt-id') # don't pass alt-id to other parts