interfaces/xml/be-mbox-to-xml


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

#!/usr/bin/env python
# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Convert an mbox into xml suitable for input into be.
  $ be-mbox-to-xml mbox | be comment --xml <ID> -
mbox is a flat-file format, consisting of a series of messages.
Messages begin with a From_ line, followed by RFC 822 email,
followed by a blank line.
"""

import base64
import email.utils
from libbe.encoding import get_encoding, set_IO_stream_encodings
from mailbox import mbox, Message  # the mailbox people really want an on-disk copy
from time import asctime, gmtime
import types
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape

# Fallback text encoding: used below to decode header values and bodies
# that declare no charset, and advertised in the emitted XML declaration.
DEFAULT_ENCODING = get_encoding()
# Helper from libbe.encoding (not shown here); presumably configures the
# standard I/O streams to use this encoding -- confirm against libbe.
set_IO_stream_encodings(DEFAULT_ENCODING)

def comment_message_to_xml(message, fields=None):
    """Render an email message as one or more <comment> XML elements.

    message -- an email/mailbox Message.  Its Message-ID, In-Reply-To,
               From and Date headers and its content type become the
               alt-id, in-reply-to, from, date and content-type tags.
    fields  -- optional dict of fallback tag values, used for any tag
               whose header is missing from `message`.  Keys that are
               not among the five tags above are copied through to the
               output.  The dict itself is never modified.

    Returns a unicode string.  A non-multipart message yields a single
    <comment> element whose body is the decoded text (text/* parts) or
    the base64-encoded payload (anything else).  A multipart message
    yields one <comment> per leaf part, joined by newlines: the first
    part inherits the message's alt-id, and every later part is emitted
    without an alt-id, as a reply to that alt-id.

    Raises AssertionError if the payload of a part cannot be decoded.
    """
    if fields is None:
        fields = {}
    new_fields = {
        u'alt-id': message[u'message-id'],
        u'in-reply-to': message[u'in-reply-to'],
        u'from': message[u'from'],
        u'date': message[u'date'],
        u'content-type': message.get_content_type(),
    }
    # Iterate over a snapshot because values are replaced inside the loop.
    for k, v in list(new_fields.items()):
        if v is None:
            # Header missing from this message: fall back on the caller's
            # value when there is one.
            if k in fields:
                new_fields[k] = fields[k]
        elif not isinstance(v, types.UnicodeType):
            # Store the decoded value in new_fields (not in the caller's
            # dict), otherwise the conversion is thrown away below.
            new_fields[k] = unicode(v, encoding=DEFAULT_ENCODING)
    # Carry over any extra tags supplied by the caller.
    for k, v in fields.items():
        if k not in new_fields:
            new_fields[k] = v
    fields = new_fields

    if message.is_multipart():
        ret = []
        alt_id = fields[u'alt-id']
        for m in message.walk():
            # walk() already descends into nested containers, so skip
            # every container (including `message` itself) and convert
            # each leaf part exactly once.
            if m.is_multipart():
                continue
            # Each part gets its own copy, so that parts cannot leak
            # values into one another through a shared dict.
            part_fields = dict(fields)
            if ret:  # we've added one part already
                # Only the first part carries the message's alt-id; the
                # remaining parts are attached as replies to it.
                part_fields.pop(u'alt-id', None)
                part_fields[u'in-reply-to'] = alt_id
            ret.append(comment_message_to_xml(m, part_fields))
        return u'\n'.join(ret)

    charset = message.get_content_charset(DEFAULT_ENCODING).lower()

    # decode=True undoes the Content-Transfer-Encoding (base64, quoted-
    # printable, ...), so that header needs no separate handling here.
    body = message.get_payload(decode=True)
    assert body is not None, "Unable to decode?"
    if fields[u'content-type'].startswith(u"text/"):
        body = unicode(body, encoding=charset).rstrip(u'\n')
    else:
        # base64.encode() works on file objects; b64encode() is the
        # string-to-string form.  Its output is pure ASCII.
        body = base64.b64encode(body).decode('ascii')
    fields[u'body'] = body

    lines = [u"<comment>"]
    for tag, value in fields.items():
        if value is not None:
            lines.append(u"  <%s>%s</%s>" % (tag, escape(value), tag))
    lines.append(u"</comment>")
    return u'\n'.join(lines)

def main(mbox_filename):
    mb = mbox(mbox_filename)
    print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING
    print u"<comment-list>"
    for message in mb:
        print comment_message_to_xml(message)
    print u"</comment-list>"


if __name__ == "__main__":
    import sys

    # The mbox path is a required argument; report its absence with a
    # usage message instead of an IndexError traceback.  Any additional
    # arguments are ignored.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s MBOX_FILE\n" % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])