xml/be-mbox-to-xml


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

#!/usr/bin/env python
# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""
Convert an mbox into XML suitable for input into be.
  $ be-mbox-to-xml mbox | be comment --xml <ID> -
mbox is a flat-file format, consisting of a series of messages.
Messages begin with a From_ line, followed by RFC 822 email,
followed by a blank line.
"""

from mailbox import mbox, Message  # the mailbox people really want an on-disk copy
import email.utils
import types

import base64
from libbe.encoding import get_encoding, set_IO_stream_encodings
from time import asctime, gmtime
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape

# Encoding used to decode header values, as the fallback charset for message
# bodies, and as the encoding named in the emitted XML declaration.
DEFAULT_ENCODING = get_encoding()
# NOTE(review): presumably switches the standard I/O streams over to
# DEFAULT_ENCODING so the unicode output prints cleanly -- confirm against
# libbe.encoding.
set_IO_stream_encodings(DEFAULT_ENCODING)

def comment_message_to_xml(message, fields=None):
    """Return the <comment> XML element(s) describing an email message.

    message is an email Message (e.g. one yielded by mailbox.mbox).
    fields is an optional dict of tag -> value pairs to include in the
    comment; it is copied, never modified.  The Message-ID, In-Reply-To,
    From and Date headers are stored under the tags alt-id, in-reply-to,
    from and date.  A header that is absent from the message leaves the
    value passed in through fields untouched, which is how the parts of
    a multipart message inherit the envelope's metadata.

    A multipart message produces one <comment> per leaf part, joined by
    newlines.  The first part keeps the message's alt-id; every later
    part drops it and is marked as a reply to the first part.

    For a leaf part, a text/* body is decoded with the part's charset
    (DEFAULT_ENCODING if none is declared) and stripped of trailing
    newlines; any other body is base64-encoded.  Tags whose value is
    None are omitted, and all values are XML-escaped.

    Returns a unicode string.
    """
    # Work on a private copy so that neither the caller's dict nor the
    # dicts handed to sibling parts are changed behind their backs.
    if fields is None:
        fields = {}
    else:
        fields = dict(fields)

    headers = [
        (u'alt-id', message[u'message-id']),
        (u'in-reply-to', message[u'in-reply-to']),
        (u'from', message[u'from']),
        (u'date', message[u'date']),
        ]
    for key, value in headers:
        # Sub-parts normally carry none of these headers; only override
        # the inherited value when the message really supplies one.
        if value is not None:
            fields[key] = value
    fields[u'content-type'] = message.get_content_type()
    for key, value in fields.items():
        if value is not None and not isinstance(value, types.UnicodeType):
            fields[key] = unicode(value, encoding=DEFAULT_ENCODING)

    if message.is_multipart():
        alt_id = fields.get(u'alt-id')
        comments = []
        # Iterate over the direct children only: the recursive call takes
        # care of nested containers, so walk() would emit them twice.
        for part in message.get_payload():
            part_fields = dict(fields)
            if comments and alt_id is not None:
                # Only the first part may claim the message's alt-id;
                # the remaining parts are replies to that first part.
                part_fields.pop(u'alt-id', None)
                part_fields[u'in-reply-to'] = alt_id
            comments.append(comment_message_to_xml(part, part_fields))
        return u'\n'.join(comments)

    charset = message.get_content_charset(DEFAULT_ENCODING).lower()

    # get_payload(decode=True) undoes the Content-Transfer-Encoding
    # itself, so the header (which is frequently absent) is not consulted.
    body = message.get_payload(decode=True)
    assert body is not None, "Unable to decode?"
    if fields[u'content-type'].startswith(u"text/"):
        body = unicode(body, encoding=charset).rstrip(u'\n')
    else:
        # base64.encode() works on file objects; b64encode() is the
        # string-to-string form needed here.
        body = unicode(base64.b64encode(body), 'ascii')
    fields[u'body'] = body

    lines = [u"<comment>"]
    for tag, value in fields.items():
        if value is not None:
            lines.append(u"  <%s>%s</%s>" % (tag, escape(value), tag))
    lines.append(u"</comment>")
    return u'\n'.join(lines)

def main(mbox_filename):
    mb = mbox(mbox_filename)
    print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING
    print u"<comment-list>"
    for message in mb:
        print comment_message_to_xml(message)
    print u"</comment-list>"


if __name__ == "__main__":
    import sys
    # The mbox path is a required argument; without this check a missing
    # argument surfaces as a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: %s MBOX_FILE" % sys.argv[0])
    main(sys.argv[1])