diff options
author | W. Trevor King <wking@drexel.edu> | 2009-07-12 08:38:40 -0400 |
---|---|---|
committer | W. Trevor King <wking@drexel.edu> | 2009-07-12 08:38:40 -0400 |
commit | 76d552e5401df990a601f245f30f45d7c13cdd1e (patch) | |
tree | 5c510a12e8cb3df1dd5d30cd5aebb6b7938e2ceb | |
parent | a65b273fa14df2a085342bac14abb8a2167ff98a (diff) | |
download | bugseverywhere-76d552e5401df990a601f245f30f45d7c13cdd1e.tar.gz |
Added be-mbox-to-xml.
Reworked to allow "be comment" to handle unicode strings (see bug
e4ed63f6-9000-4d0b-98c3-487269140141). The solution was to escape all
the unicode to produce an ASCII string before calling
ElementTree.XML, and then convert back to unicode afterwards.
Added a unicode-containing comment to the end of bug
f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a so that there's a handy unicode
comment for testing.
XML headers (e.g. '<?xml version="1.0" encoding="UTF-8" ?>') are
now added to all xml output from be.
Switched non-text/* encoding library to base64 instead of
email.encoders, which makes that code in libbe/comment.py simpler.
Changed libbe/mapfile.py error encoding from string_escape to
unicode_escape so it can handle unicode.
Everything's still untested, and be-xml-to-mbox doesn't handle unicode
yet, but I felt this commit was getting a bit unwieldy ;).
21 files changed, 227 insertions, 57 deletions
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body new file mode 100644 index 0000000..0598d70 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body @@ -0,0 +1,8 @@ +<type 'unicode'> <body>�</body> +Traceback (most recent call last): + File "<string>", line 1, in <module> + File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 963, in XML + parser.feed(text) + File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 1245, in feed + self._parser.Parse(data, 0) +UnicodeEncodeError: 'ascii' codec can't encode character u'\u1234' in position 6: ordinal not in range(128) diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values new file mode 100644 index 0000000..cd8d8b9 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values @@ -0,0 +1,11 @@ +Content-type: text/plain + + +Date: Sun, 12 Jul 2009 11:34:22 +0000 + + +From: W. 
Trevor King <wking@drexel.edu> + + +In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132 + diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body new file mode 100644 index 0000000..397d4b6 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body @@ -0,0 +1 @@ +It looks like etree wants a byte string, not unicode input diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values new file mode 100644 index 0000000..8bdaf52 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values @@ -0,0 +1,11 @@ +Content-type: text/plain + + +Date: Sun, 12 Jul 2009 11:42:16 +0000 + + +From: W. Trevor King <wking@drexel.edu> + + +In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132 + diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body new file mode 100644 index 0000000..ce2bb8d --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body @@ -0,0 +1,5 @@ +For example, this works: + +python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a.encode("unicode_escape")); print type(b.text), unicode(b.text).decode("unicode_escape");' + +Ugly though :p. Ah well. 
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values new file mode 100644 index 0000000..1784e0e --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values @@ -0,0 +1,11 @@ +Content-type: text/plain + + +Date: Sun, 12 Jul 2009 11:46:57 +0000 + + +From: W. Trevor King <wking@drexel.edu> + + +In-reply-to: 520a9829-8d90-43ce-be64-868b8321e5b0 + diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body new file mode 100644 index 0000000..89a8f8d --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body @@ -0,0 +1 @@ +That's with Python 2.5.2 and ElementTree "2326 2005-03-17 07:45:21Z fredrik" diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values new file mode 100644 index 0000000..cca07c3 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values @@ -0,0 +1,11 @@ +Content-type: text/plain + + +Date: Sun, 12 Jul 2009 11:37:55 +0000 + + +From: W. 
Trevor King <wking@drexel.edu> + + +In-reply-to: 07fc448f-c42e-4846-929a-8924de485766 + diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body new file mode 100644 index 0000000..57e050d --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body @@ -0,0 +1,5 @@ +Isolated problem to: + +python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a);' + +Output attached below diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values new file mode 100644 index 0000000..e430ea0 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values @@ -0,0 +1,8 @@ +Content-type: text/plain + + +Date: Sun, 12 Jul 2009 11:31:13 +0000 + + +From: W. Trevor King <wking@drexel.edu> + diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values new file mode 100644 index 0000000..4bc81f5 --- /dev/null +++ b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values @@ -0,0 +1,17 @@ +creator: W. Trevor King <wking@drexel.edu> + + +reporter: W. 
Trevor King <wking@drexel.edu> + + +severity: minor + + +status: fixed + + +summary: utf8 problems in xml parsing + + +time: Sat, 11 Jul 2009 15:48:32 +0000 + diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values index eb56317..d39c4a1 100644 --- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values +++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values @@ -1,21 +1,8 @@ +Content-type: text/plain - -Content-type=text/plain - - - - - - -Date=Tue, 25 Nov 2008 19:41:02 +0000 - - - - - - -From=W. Trevor King <wking@drexel.edu> +Date: Tue, 25 Nov 2008 19:41:02 +0000 +From: W. Trevor King <wking@drexel.edu> diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values index f976972..639fd4a 100644 --- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values +++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values @@ -1,21 +1,8 @@ +Content-type: text/plain - -Content-type=text/plain - - - - - - -Date=Tue, 25 Nov 2008 02:36:16 +0000 - - - - - - -From=W. Trevor King <wking@drexel.edu> +Date: Tue, 25 Nov 2008 02:36:16 +0000 +From: W. 
Trevor King <wking@drexel.edu> diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values index bf5085b..2821b2f 100644 --- a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values +++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values @@ -1,21 +1,8 @@ +Content-type: text/plain - -Content-type=text/plain - - - - - - -Date=Tue, 25 Nov 2008 03:02:59 +0000 - - - - - - -From=W. Trevor King <wking@drexel.edu> +Date: Tue, 25 Nov 2008 03:02:59 +0000 +From: W. Trevor King <wking@drexel.edu> diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body new file mode 100644 index 0000000..b441da9 --- /dev/null +++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body @@ -0,0 +1 @@ +Test unicode �quotes� diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values new file mode 100644 index 0000000..a67680d --- /dev/null +++ b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values @@ -0,0 +1,8 @@ +Content-type: text/plain + + +Date: Sat, 11 Jul 2009 18:28:57 +0000 + + +From: W. 
Trevor King <wking@drexel.edu> + diff --git a/becommands/comment.py b/becommands/comment.py index 1e6ecd4..c4b074f 100644 --- a/becommands/comment.py +++ b/becommands/comment.py @@ -117,7 +117,8 @@ def execute(args, test=False): new.content_type = options.content_type else: # import XML comment [list] # read in the comments - comment_list = ElementTree.XML(body) + str_body = body.strip().encode("unicode_escape") + comment_list = ElementTree.XML(str_body) if comment_list.tag not in ["bug", "comment-list"]: raise comment.InvalidXML( comment_list, "root element must be <bug> or <comment-list>") @@ -130,7 +131,7 @@ def execute(args, test=False): for child in comment_list.getchildren(): if child.tag == "comment": new = comment.Comment(bug) - new.from_xml(ElementTree.tostring(child)) + new.from_xml(unicode(ElementTree.tostring(child)).decode("unicode_escape")) if new.alt_id in ids: raise cmdutil.UserError( "Clashing comment alt_id: %s" % new.alt_id) diff --git a/becommands/show.py b/becommands/show.py index a4208c3..f700caa 100644 --- a/becommands/show.py +++ b/becommands/show.py @@ -40,6 +40,7 @@ def execute(args, test=False): Bug A <BLANKLINE> >>> execute (["--xml", "a"], test=True) # doctest: +ELLIPSIS + <?xml version="1.0" encoding="..." 
?> <bug> <uuid>a</uuid> <short-name>a</short-name> @@ -70,6 +71,7 @@ def execute(args, test=False): bug = bd.bug_from_shortname(bugname) if is_comment == False: if options.dumpXML: + print '<?xml version="1.0" encoding="%s" ?>' % bd.encoding print bug.xml(show_comments=True) else: print bug.string(show_comments=True) diff --git a/libbe/comment.py b/libbe/comment.py index d4d47a8..7acbbb1 100644 --- a/libbe/comment.py +++ b/libbe/comment.py @@ -17,10 +17,11 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA -import email.mime.base, email.encoders +import base64 import os import os.path import time +import types try: # import core module, Python >= 2.5 from xml.etree import ElementTree except ImportError: # look for non-core module @@ -80,10 +81,13 @@ def list_to_root(comments, bug, root=None): else: uuid_map[root.uuid] = root for comm in comments: + if comm.in_reply_to == INVALID_UUID: + comm.in_reply_to = None rep = comm.in_reply_to if rep == None or rep == bug.uuid: root_comments.append(comm) else: + print comm.in_reply_to parentUUID = comm.in_reply_to parent = uuid_map[parentUUID] parent.add_reply(comm) @@ -269,7 +273,7 @@ class Comment(Tree, settings_object.SavedSettingsObject): msg = email.mime.base.MIMEBase(maintype, subtype) msg.set_payload(self.body or "") email.encoders.encode_base64(msg) - body = msg.as_string() + body = base64.encodestring(self.body or "") info = [("uuid", self.uuid), ("alt-id", self.alt_id), ("short-name", shortname), @@ -310,11 +314,14 @@ class Comment(Tree, settings_object.SavedSettingsObject): >>> commA.From >>> commB.From """ + if type(xml_string) == types.UnicodeType: + xml_string = xml_string.strip().encode("unicode_escape") comment = ElementTree.XML(xml_string) if comment.tag != "comment": raise InvalidXML(comment, "root element must be <comment>") tags=['uuid','alt-id','in-reply-to','from','date','content-type','body'] uuid = None + 
body = None for child in comment.getchildren(): if child.tag == "short-name": pass @@ -322,24 +329,31 @@ class Comment(Tree, settings_object.SavedSettingsObject): if child.text == None or len(child.text) == 0: text = settings_object.EMPTY else: - text = xml.sax.saxutils.unescape(child.text.strip()) + text = xml.sax.saxutils.unescape(child.text) + text = unicode(text).decode("unicode_escape").strip() if child.tag == "uuid": uuid = text continue # don't set the bug's uuid tag. + if child.tag == "body": + body = text + continue # don't set the bug's body yet. elif child.tag == 'from': attr_name = "From" elif child.tag == 'date': attr_name = 'time_string' else: attr_name = child.tag.replace('-','_') - if attr_name == "body": - text += '\n' # replace strip()ed trailing newline setattr(self, attr_name, text) elif verbose == True: print >> sys.stderr, "Ignoring unknown tag %s in %s" \ % (child.tag, comment.tag) if self.alt_id == None and uuid not in [None, self.uuid]: self.alt_id = uuid + if body != None: + if self.content_type.startswith("text/"): + self.body = body + else: + self.body = base64.decodestring(body) def string(self, indent=0, shortname=None): """ diff --git a/libbe/mapfile.py b/libbe/mapfile.py index 40386e2..b183bfe 100644 --- a/libbe/mapfile.py +++ b/libbe/mapfile.py @@ -67,9 +67,9 @@ def generate(map): assert(':' not in key) assert(len(key) > 0) except AssertionError: - raise IllegalKey(key.encode('string_escape')) + raise IllegalKey(unicode(key).encode('unicode_escape')) if "\n" in map[key]: - raise IllegalValue(map[key].encode('string_escape')) + raise IllegalValue(unicode(map[key]).encode('unicode_escape')) lines = [] for key in keys: diff --git a/xml/be-mbox-to-xml b/xml/be-mbox-to-xml new file mode 100755 index 0000000..e9077b1 --- /dev/null +++ b/xml/be-mbox-to-xml @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# Copyright (C) 2009 W. 
Trevor King <wking@drexel.edu> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +""" +Convert an mbox into xml suitable for input into be. + $ cat mbox | be-mbox-to-xml | be comment --xml <ID> - +mbox is a flat-file format, consisting of a series of messages. +Messages begin with a From_ line, followed by RFC 822 email, +followed by a blank line. 
+""" + +from mailbox import mbox, Message # the mailbox people really want an on-disk copy +import email.utils +import types + +import base64 +from libbe.encoding import get_encoding, set_IO_stream_encodings +from time import asctime, gmtime +from xml.sax import make_parser +from xml.sax.handler import ContentHandler +from xml.sax.saxutils import escape + +DEFAULT_ENCODING = get_encoding() +set_IO_stream_encodings(DEFAULT_ENCODING) + +def comment_message_to_xml(message, fields=None): + if fields == None: + fields = {} + fields[u'alt-id'] = message[u'message-id'] + fields[u'in-reply-to'] = message[u'in-reply-to'] + fields[u'from'] = message[u'from'] + fields[u'date'] = message[u'date'] + fields[u'content-type'] = message.get_content_type() + for k,v in fields.items(): + if v != None and type(v) != types.UnicodeType: + fields[k] = unicode(v, encoding=DEFAULT_ENCODING) + + if message.is_multipart(): + ret = [] + alt_id = fields[u'alt-id'] + for m in message.walk(): + if m == message: + continue + if len(ret) >= 0: + fields.pop(u'alt-id') + fields[u'in-reply-to'] = alt_id + ret.append(comment_message_to_xml(m, fields)) + return u'\n'.join(ret) + + charset = message.get_content_charset(DEFAULT_ENCODING).lower() + #assert charset == DEFAULT_ENCODING.lower(), \ + # u"Unknown charset: %s" % charset + + encoding = message[u'content-transfer-encoding'].lower() + body = message.get_payload(decode=True) # attempt to decode + assert body != None, "Unable to decode?" 
+ if fields[u'content-type'].startswith(u"text/"): + body = unicode(body, encoding=charset).rstrip(u'\n') + else: + body = base64.encode(body) + fields[u'body'] = body + lines = [u"<comment>"] + for tag,body in fields.items(): + if body != None: + ebody = escape(body) + lines.append(u" <%s>%s</%s>" % (tag, ebody, tag)) + lines.append(u"</comment>") + return u'\n'.join(lines) + +def main(mbox_filename): + mb = mbox(mbox_filename) + print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING + print u"<comment-list>" + for message in mb: + print comment_message_to_xml(message) + print u"</comment-list>" + + +if __name__ == "__main__": + import sys + main(sys.argv[1]) |