aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorW. Trevor King <wking@drexel.edu>2009-07-12 08:38:40 -0400
committerW. Trevor King <wking@drexel.edu>2009-07-12 08:38:40 -0400
commit76d552e5401df990a601f245f30f45d7c13cdd1e (patch)
tree5c510a12e8cb3df1dd5d30cd5aebb6b7938e2ceb
parenta65b273fa14df2a085342bac14abb8a2167ff98a (diff)
downloadbugseverywhere-76d552e5401df990a601f245f30f45d7c13cdd1e.tar.gz
Added be-mbox-to-xml.
Reworked to allow "be comment" to handle unicode strings (see bug e4ed63f6-9000-4d0b-98c3-487269140141). The solution was to escape all the unicode to produce and ASCII string before calling ElementTree.XML, and then converting back to unicode afterwards. Added a unicode-containing comment to the end of bug f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a so that there's a handy unicode comment for testing. XML headers (e.g. '<?xml version="1.0" encoding="UTF-8" ?>') are now added to all xml output from be. Switched non-text/* encoding library to base64 instead of email.encoders, which makes that code in libbe/comment.py simpler. Changed libbe/mapfile.py error encoding from string_escape to unicode_escape so it can handle unicode. Everything's still untested, and be-xml-to-mbox doesn't handle unicode yet, but I felt this commit was getting a bit unwieldy ;).
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body8
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values11
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body1
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values11
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body5
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values11
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body1
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values11
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body5
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values8
-rw-r--r--.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values17
-rw-r--r--.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values19
-rw-r--r--.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values19
-rw-r--r--.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values19
-rw-r--r--.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body1
-rw-r--r--.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values8
-rw-r--r--becommands/comment.py5
-rw-r--r--becommands/show.py2
-rw-r--r--libbe/comment.py24
-rw-r--r--libbe/mapfile.py4
-rwxr-xr-xxml/be-mbox-to-xml94
21 files changed, 227 insertions, 57 deletions
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body
new file mode 100644
index 0000000..0598d70
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body
@@ -0,0 +1,8 @@
+<type 'unicode'> <body>�</body>
+Traceback (most recent call last):
+ File "<string>", line 1, in <module>
+ File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 963, in XML
+ parser.feed(text)
+ File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 1245, in feed
+ self._parser.Parse(data, 0)
+UnicodeEncodeError: 'ascii' codec can't encode character u'\u1234' in position 6: ordinal not in range(128)
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values
new file mode 100644
index 0000000..cd8d8b9
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:34:22 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body
new file mode 100644
index 0000000..397d4b6
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body
@@ -0,0 +1 @@
+It looks like etree wants a byte string, not unicode input
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values
new file mode 100644
index 0000000..8bdaf52
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:42:16 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body
new file mode 100644
index 0000000..ce2bb8d
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body
@@ -0,0 +1,5 @@
+For example, this works:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a.encode("unicode_escape")); print type(b.text), unicode(b.text).decode("unicode_escape");'
+
+Ugly though :p. Ah well.
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values
new file mode 100644
index 0000000..1784e0e
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:46:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 520a9829-8d90-43ce-be64-868b8321e5b0
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body
new file mode 100644
index 0000000..89a8f8d
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body
@@ -0,0 +1 @@
+That's with Python 2.5.2 and ElementTree "2326 2005-03-17 07:45:21Z fredrik"
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values
new file mode 100644
index 0000000..cca07c3
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:37:55 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 07fc448f-c42e-4846-929a-8924de485766
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body
new file mode 100644
index 0000000..57e050d
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body
@@ -0,0 +1,5 @@
+Isolated problem to:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a);'
+
+Output attached below
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values
new file mode 100644
index 0000000..e430ea0
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values
@@ -0,0 +1,8 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:31:13 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values
new file mode 100644
index 0000000..4bc81f5
--- /dev/null
+++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values
@@ -0,0 +1,17 @@
+creator: W. Trevor King <wking@drexel.edu>
+
+
+reporter: W. Trevor King <wking@drexel.edu>
+
+
+severity: minor
+
+
+status: fixed
+
+
+summary: utf8 problems in xml parsing
+
+
+time: Sat, 11 Jul 2009 15:48:32 +0000
+
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values
index eb56317..d39c4a1 100644
--- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values
+++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values
@@ -1,21 +1,8 @@
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 19:41:02 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 19:41:02 +0000
+From: W. Trevor King <wking@drexel.edu>
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values
index f976972..639fd4a 100644
--- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values
+++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values
@@ -1,21 +1,8 @@
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 02:36:16 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 02:36:16 +0000
+From: W. Trevor King <wking@drexel.edu>
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values
index bf5085b..2821b2f 100644
--- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values
+++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values
@@ -1,21 +1,8 @@
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 03:02:59 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 03:02:59 +0000
+From: W. Trevor King <wking@drexel.edu>
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body
new file mode 100644
index 0000000..b441da9
--- /dev/null
+++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body
@@ -0,0 +1 @@
+Test unicode �quotes�
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values
new file mode 100644
index 0000000..a67680d
--- /dev/null
+++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values
@@ -0,0 +1,8 @@
+Content-type: text/plain
+
+
+Date: Sat, 11 Jul 2009 18:28:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
diff --git a/becommands/comment.py b/becommands/comment.py
index 1e6ecd4..c4b074f 100644
--- a/becommands/comment.py
+++ b/becommands/comment.py
@@ -117,7 +117,8 @@ def execute(args, test=False):
new.content_type = options.content_type
else: # import XML comment [list]
# read in the comments
- comment_list = ElementTree.XML(body)
+ str_body = body.strip().encode("unicode_escape")
+ comment_list = ElementTree.XML(str_body)
if comment_list.tag not in ["bug", "comment-list"]:
raise comment.InvalidXML(
comment_list, "root element must be <bug> or <comment-list>")
@@ -130,7 +131,7 @@ def execute(args, test=False):
for child in comment_list.getchildren():
if child.tag == "comment":
new = comment.Comment(bug)
- new.from_xml(ElementTree.tostring(child))
+ new.from_xml(unicode(ElementTree.tostring(child)).decode("unicode_escape"))
if new.alt_id in ids:
raise cmdutil.UserError(
"Clashing comment alt_id: %s" % new.alt_id)
diff --git a/becommands/show.py b/becommands/show.py
index a4208c3..f700caa 100644
--- a/becommands/show.py
+++ b/becommands/show.py
@@ -40,6 +40,7 @@ def execute(args, test=False):
Bug A
<BLANKLINE>
>>> execute (["--xml", "a"], test=True) # doctest: +ELLIPSIS
+ <?xml version="1.0" encoding="..." ?>
<bug>
<uuid>a</uuid>
<short-name>a</short-name>
@@ -70,6 +71,7 @@ def execute(args, test=False):
bug = bd.bug_from_shortname(bugname)
if is_comment == False:
if options.dumpXML:
+ print '<?xml version="1.0" encoding="%s" ?>' % bd.encoding
print bug.xml(show_comments=True)
else:
print bug.string(show_comments=True)
diff --git a/libbe/comment.py b/libbe/comment.py
index d4d47a8..7acbbb1 100644
--- a/libbe/comment.py
+++ b/libbe/comment.py
@@ -17,10 +17,11 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA
-import email.mime.base, email.encoders
+import base64
import os
import os.path
import time
+import types
try: # import core module, Python >= 2.5
from xml.etree import ElementTree
except ImportError: # look for non-core module
@@ -80,10 +81,13 @@ def list_to_root(comments, bug, root=None):
else:
uuid_map[root.uuid] = root
for comm in comments:
+ if comm.in_reply_to == INVALID_UUID:
+ comm.in_reply_to = None
rep = comm.in_reply_to
if rep == None or rep == bug.uuid:
root_comments.append(comm)
else:
+ print comm.in_reply_to
parentUUID = comm.in_reply_to
parent = uuid_map[parentUUID]
parent.add_reply(comm)
@@ -269,7 +273,7 @@ class Comment(Tree, settings_object.SavedSettingsObject):
msg = email.mime.base.MIMEBase(maintype, subtype)
msg.set_payload(self.body or "")
email.encoders.encode_base64(msg)
- body = msg.as_string()
+ body = base64.encodestring(self.body or "")
info = [("uuid", self.uuid),
("alt-id", self.alt_id),
("short-name", shortname),
@@ -310,11 +314,14 @@ class Comment(Tree, settings_object.SavedSettingsObject):
>>> commA.From
>>> commB.From
"""
+ if type(xml_string) == types.UnicodeType:
+ xml_string = xml_string.strip().encode("unicode_escape")
comment = ElementTree.XML(xml_string)
if comment.tag != "comment":
raise InvalidXML(comment, "root element must be <comment>")
tags=['uuid','alt-id','in-reply-to','from','date','content-type','body']
uuid = None
+ body = None
for child in comment.getchildren():
if child.tag == "short-name":
pass
@@ -322,24 +329,31 @@ class Comment(Tree, settings_object.SavedSettingsObject):
if child.text == None or len(child.text) == 0:
text = settings_object.EMPTY
else:
- text = xml.sax.saxutils.unescape(child.text.strip())
+ text = xml.sax.saxutils.unescape(child.text)
+ text = unicode(text).decode("unicode_escape").strip()
if child.tag == "uuid":
uuid = text
continue # don't set the bug's uuid tag.
+ if child.tag == "body":
+ body = text
+ continue # don't set the bug's body yet.
elif child.tag == 'from':
attr_name = "From"
elif child.tag == 'date':
attr_name = 'time_string'
else:
attr_name = child.tag.replace('-','_')
- if attr_name == "body":
- text += '\n' # replace strip()ed trailing newline
setattr(self, attr_name, text)
elif verbose == True:
print >> sys.stderr, "Ignoring unknown tag %s in %s" \
% (child.tag, comment.tag)
if self.alt_id == None and uuid not in [None, self.uuid]:
self.alt_id = uuid
+ if body != None:
+ if self.content_type.startswith("text/"):
+ self.body = body
+ else:
+ self.body = base64.decodestring(body)
def string(self, indent=0, shortname=None):
"""
diff --git a/libbe/mapfile.py b/libbe/mapfile.py
index 40386e2..b183bfe 100644
--- a/libbe/mapfile.py
+++ b/libbe/mapfile.py
@@ -67,9 +67,9 @@ def generate(map):
assert(':' not in key)
assert(len(key) > 0)
except AssertionError:
- raise IllegalKey(key.encode('string_escape'))
+ raise IllegalKey(unicode(key).encode('unicode_escape'))
if "\n" in map[key]:
- raise IllegalValue(map[key].encode('string_escape'))
+ raise IllegalValue(unicode(map[key]).encode('unicode_escape'))
lines = []
for key in keys:
diff --git a/xml/be-mbox-to-xml b/xml/be-mbox-to-xml
new file mode 100755
index 0000000..e9077b1
--- /dev/null
+++ b/xml/be-mbox-to-xml
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+"""
+Convert an mbox into xml suitable for imput into be.
+ $ cat mbox | be-mbox-to-xml | be comment --xml <ID> -
+mbox is a flat-file format, consisting of a series of messages.
+Messages begin with a a From_ line, followed by RFC 822 email,
+followed by a blank line.
+"""
+
+from mailbox import mbox, Message # the mailbox people really want an on-disk copy
+import email.utils
+import types
+
+import base64
+from libbe.encoding import get_encoding, set_IO_stream_encodings
+from time import asctime, gmtime
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+from xml.sax.saxutils import escape
+
+DEFAULT_ENCODING = get_encoding()
+set_IO_stream_encodings(DEFAULT_ENCODING)
+
+def comment_message_to_xml(message, fields=None):
+ if fields == None:
+ fields = {}
+ fields[u'alt-id'] = message[u'message-id']
+ fields[u'in-reply-to'] = message[u'in-reply-to']
+ fields[u'from'] = message[u'from']
+ fields[u'date'] = message[u'date']
+ fields[u'content-type'] = message.get_content_type()
+ for k,v in fields.items():
+ if v != None and type(v) != types.UnicodeType:
+ fields[k] = unicode(v, encoding=DEFAULT_ENCODING)
+
+ if message.is_multipart():
+ ret = []
+ alt_id = fields[u'alt-id']
+ for m in message.walk():
+ if m == message:
+ continue
+ if len(ret) >= 0:
+ fields.pop(u'alt-id')
+ fields[u'in-reply-to'] = alt_id
+ ret.append(comment_message_to_xml(m, fields))
+ return u'\n'.join(ret)
+
+ charset = message.get_content_charset(DEFAULT_ENCODING).lower()
+ #assert charset == DEFAULT_ENCODING.lower(), \
+ # u"Unknown charset: %s" % charset
+
+ encoding = message[u'content-transfer-encoding'].lower()
+ body = message.get_payload(decode=True) # attempt to decode
+ assert body != None, "Unable to decode?"
+ if fields[u'content-type'].startswith(u"text/"):
+ body = unicode(body, encoding=charset).rstrip(u'\n')
+ else:
+ body = base64.encode(body)
+ fields[u'body'] = body
+ lines = [u"<comment>"]
+ for tag,body in fields.items():
+ if body != None:
+ ebody = escape(body)
+ lines.append(u" <%s>%s</%s>" % (tag, ebody, tag))
+ lines.append(u"</comment>")
+ return u'\n'.join(lines)
+
+def main(mbox_filename):
+ mb = mbox(mbox_filename)
+ print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING
+ print u"<comment-list>"
+ for message in mb:
+ print comment_message_to_xml(message)
+ print u"</comment-list>"
+
+
+if __name__ == "__main__":
+ import sys
+ main(sys.argv[1])