1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#!/usr/bin/env python
# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
Convert an mbox into xml suitable for input into be.
$ be-mbox-to-xml mbox | be comment --xml <ID> -
mbox is a flat-file format, consisting of a series of messages.
Messages begin with a From_ line, followed by RFC 822 email,
followed by a blank line.
"""
from mailbox import mbox, Message # the mailbox people really want an on-disk copy
import email.utils
import types
import base64
from libbe.encoding import get_encoding, set_IO_stream_encodings
from time import asctime, gmtime
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape
# Encoding used in this file to decode header values, as the fallback
# charset for message bodies, and in the emitted XML declaration.
DEFAULT_ENCODING = get_encoding()
# Hand the same encoding to libbe's stream setup (presumably configures
# the standard I/O streams -- confirm in libbe.encoding).
set_IO_stream_encodings(DEFAULT_ENCODING)
def comment_message_to_xml(message, fields=None):
    """Render an email message as one or more <comment> XML elements.

    message -- an email Message object (e.g. as yielded by mailbox.mbox).
    fields  -- optional dict mapping tag names to default values.
               Headers present on `message` take precedence; headers
               that are missing fall back to these values.  The dict
               passed in is not modified.

    Returns a unicode string.  A non-multipart message produces a single
    <comment> element whose children are the non-None fields plus the
    body (text/* bodies are decoded to unicode, everything else is
    base64-encoded).  A multipart message produces one <comment> per
    leaf part, joined by newlines: the first part keeps the message's
    alt-id, later parts drop it and are marked as replies to it.
    """
    # Work on a copy so neither the caller's dict nor sibling parts see
    # the values set while processing this message.
    fields = dict(fields) if fields is not None else {}

    headers = {
        u'alt-id': message[u'message-id'],
        u'in-reply-to': message[u'in-reply-to'],
        u'from': message[u'from'],
        u'date': message[u'date'],
        u'content-type': message.get_content_type(),
    }
    for key, value in headers.items():
        # Sub-parts normally lack Message-ID/From/Date headers; keep the
        # inherited value instead of overwriting it with None.
        if value is not None or key not in fields:
            fields[key] = value

    for key, value in list(fields.items()):
        if value is not None and not isinstance(value, unicode):
            fields[key] = unicode(value, encoding=DEFAULT_ENCODING)

    if message.is_multipart():
        alt_id = fields[u'alt-id']
        ret = []
        # Iterate direct children only; nested multiparts are expanded
        # by the recursive call.  (walk() also descends into them, which
        # would emit their leaves a second time.)
        for part in message.get_payload():
            part_fields = dict(fields)
            if ret:  # at least one part has already been emitted
                part_fields.pop(u'alt-id', None)  # only the first part owns the id
                part_fields[u'in-reply-to'] = alt_id  # others respond to the first
            ret.append(comment_message_to_xml(part, part_fields))
        return u'\n'.join(ret)

    body = message.get_payload(decode=True)  # undo the transfer encoding
    assert body is not None, "Unable to decode?"
    if fields[u'content-type'].startswith(u"text/"):
        charset = message.get_content_charset(DEFAULT_ENCODING).lower()
        body = unicode(body, encoding=charset).rstrip(u'\n')
    else:
        # base64.encode() is the file-to-file API; b64encode() works on
        # an in-memory string.  The result is pure ASCII.
        body = base64.b64encode(body).decode('ascii')
    fields[u'body'] = body

    lines = [u"<comment>"]
    for tag, value in fields.items():
        if value is not None:
            lines.append(u" <%s>%s</%s>" % (tag, escape(value), tag))
    lines.append(u"</comment>")
    return u'\n'.join(lines)
def main(mbox_filename):
mb = mbox(mbox_filename)
print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING
print u"<comment-list>"
for message in mb:
print comment_message_to_xml(message)
print u"</comment-list>"
if __name__ == "__main__":
    import sys
    # The mailbox module reads from a file on disk, so a path argument
    # is required; fail with a usage message instead of an IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: %s MBOX_FILE" % sys.argv[0])
    main(sys.argv[1])
|