--- /dev/null
+<type 'unicode'> <body>ሴ</body>
+Traceback (most recent call last):
+ File "<string>", line 1, in <module>
+ File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 963, in XML
+ parser.feed(text)
+ File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 1245, in feed
+ self._parser.Parse(data, 0)
+UnicodeEncodeError: 'ascii' codec can't encode character u'\u1234' in position 6: ordinal not in range(128)
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:34:22 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
--- /dev/null
+It looks like etree wants a byte string, not unicode input
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:42:16 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
--- /dev/null
+For example, this works:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a.encode("unicode_escape")); print type(b.text), unicode(b.text).decode("unicode_escape");'
+
+Ugly though :p. Ah well.
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:46:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 520a9829-8d90-43ce-be64-868b8321e5b0
+
--- /dev/null
+That's with Python 2.5.2 and ElementTree "2326 2005-03-17 07:45:21Z fredrik"
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:37:55 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 07fc448f-c42e-4846-929a-8924de485766
+
--- /dev/null
+Isolated problem to:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a);'
+
+Output attached below
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:31:13 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
--- /dev/null
+creator: W. Trevor King <wking@drexel.edu>
+
+
+reporter: W. Trevor King <wking@drexel.edu>
+
+
+severity: minor
+
+
+status: fixed
+
+
+summary: utf8 problems in xml parsing
+
+
+time: Sat, 11 Jul 2009 15:48:32 +0000
+
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 19:41:02 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 19:41:02 +0000
+From: W. Trevor King <wking@drexel.edu>
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 02:36:16 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 02:36:16 +0000
+From: W. Trevor King <wking@drexel.edu>
+Content-type: text/plain
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 03:02:59 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 03:02:59 +0000
+From: W. Trevor King <wking@drexel.edu>
--- /dev/null
+Test unicode �quotes�
--- /dev/null
+Content-type: text/plain
+
+
+Date: Sat, 11 Jul 2009 18:28:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
new.content_type = options.content_type
else: # import XML comment [list]
# read in the comments
- comment_list = ElementTree.XML(body)
+ str_body = body.strip().encode("unicode_escape")
+ comment_list = ElementTree.XML(str_body)
if comment_list.tag not in ["bug", "comment-list"]:
raise comment.InvalidXML(
comment_list, "root element must be <bug> or <comment-list>")
for child in comment_list.getchildren():
if child.tag == "comment":
new = comment.Comment(bug)
- new.from_xml(ElementTree.tostring(child))
+ new.from_xml(unicode(ElementTree.tostring(child)).decode("unicode_escape"))
if new.alt_id in ids:
raise cmdutil.UserError(
"Clashing comment alt_id: %s" % new.alt_id)
Bug A
<BLANKLINE>
>>> execute (["--xml", "a"], test=True) # doctest: +ELLIPSIS
+ <?xml version="1.0" encoding="..." ?>
<bug>
<uuid>a</uuid>
<short-name>a</short-name>
bug = bd.bug_from_shortname(bugname)
if is_comment == False:
if options.dumpXML:
+ print '<?xml version="1.0" encoding="%s" ?>' % bd.encoding
print bug.xml(show_comments=True)
else:
print bug.string(show_comments=True)
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA
-import email.mime.base, email.encoders
+import base64
import os
import os.path
import time
+import types
try: # import core module, Python >= 2.5
from xml.etree import ElementTree
except ImportError: # look for non-core module
else:
uuid_map[root.uuid] = root
for comm in comments:
+ if comm.in_reply_to == INVALID_UUID:
+ comm.in_reply_to = None
rep = comm.in_reply_to
if rep == None or rep == bug.uuid:
root_comments.append(comm)
else:
+ print comm.in_reply_to
parentUUID = comm.in_reply_to
parent = uuid_map[parentUUID]
parent.add_reply(comm)
msg = email.mime.base.MIMEBase(maintype, subtype)
msg.set_payload(self.body or "")
email.encoders.encode_base64(msg)
- body = msg.as_string()
+ body = base64.encodestring(self.body or "")
info = [("uuid", self.uuid),
("alt-id", self.alt_id),
("short-name", shortname),
>>> commA.From
>>> commB.From
"""
+ if type(xml_string) == types.UnicodeType:
+ xml_string = xml_string.strip().encode("unicode_escape")
comment = ElementTree.XML(xml_string)
if comment.tag != "comment":
raise InvalidXML(comment, "root element must be <comment>")
tags=['uuid','alt-id','in-reply-to','from','date','content-type','body']
uuid = None
+ body = None
for child in comment.getchildren():
if child.tag == "short-name":
pass
if child.text == None or len(child.text) == 0:
text = settings_object.EMPTY
else:
- text = xml.sax.saxutils.unescape(child.text.strip())
+ text = xml.sax.saxutils.unescape(child.text)
+ text = unicode(text).decode("unicode_escape").strip()
if child.tag == "uuid":
uuid = text
continue # don't set the bug's uuid tag.
+ if child.tag == "body":
+ body = text
+ continue # don't set the bug's body yet.
elif child.tag == 'from':
attr_name = "From"
elif child.tag == 'date':
attr_name = 'time_string'
else:
attr_name = child.tag.replace('-','_')
- if attr_name == "body":
- text += '\n' # replace strip()ed trailing newline
setattr(self, attr_name, text)
elif verbose == True:
print >> sys.stderr, "Ignoring unknown tag %s in %s" \
% (child.tag, comment.tag)
if self.alt_id == None and uuid not in [None, self.uuid]:
self.alt_id = uuid
+ if body != None:
+ if self.content_type.startswith("text/"):
+ self.body = body
+ else:
+ self.body = base64.decodestring(body)
def string(self, indent=0, shortname=None):
"""
assert(':' not in key)
assert(len(key) > 0)
except AssertionError:
- raise IllegalKey(key.encode('string_escape'))
+ raise IllegalKey(unicode(key).encode('unicode_escape'))
if "\n" in map[key]:
- raise IllegalValue(map[key].encode('string_escape'))
+ raise IllegalValue(unicode(map[key]).encode('unicode_escape'))
lines = []
for key in keys:
--- /dev/null
+#!/usr/bin/env python
+# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+"""
+Convert an mbox into xml suitable for input into be.
+ $ cat mbox | be-mbox-to-xml | be comment --xml <ID> -
+mbox is a flat-file format, consisting of a series of messages.
+Messages begin with a From_ line, followed by an RFC 822 email,
+followed by a blank line.
+"""
+
+from mailbox import mbox, Message # the mailbox people really want an on-disk copy
+import email.utils
+import types
+
+import base64
+from libbe.encoding import get_encoding, set_IO_stream_encodings
+from time import asctime, gmtime
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+from xml.sax.saxutils import escape
+
+# Resolve the default encoding once, at import time, and hand it to
+# set_IO_stream_encodings() (presumably this re-wraps stdin/stdout --
+# confirm in libbe.encoding).  The same value is used below to decode
+# message headers and to label the generated XML document.
+DEFAULT_ENCODING = get_encoding()
+set_IO_stream_encodings(DEFAULT_ENCODING)
+
+def comment_message_to_xml(message, fields=None):
+    """
+    Convert an email message into one or more <comment> XML elements.
+
+    message -- an email.message.Message (e.g. one yielded by an mbox).
+    fields  -- optional dict of inherited values keyed by tag name
+               (u'alt-id', u'in-reply-to', u'from', u'date', ...).
+               A header that is present on message overrides the
+               inherited value; an absent header falls back to it.
+               The caller's dict is never modified.
+
+    Returns a unicode string.  A non-multipart message yields a single
+    <comment> element.  A multipart message yields one element per
+    part, joined by newlines: the first part keeps the message's
+    alt-id and every later part is marked as a reply to it.
+
+    text/* bodies are decoded with the part's charset (falling back to
+    DEFAULT_ENCODING); any other content type is base64 encoded.
+    """
+    # Work on a private copy so neither the caller nor sibling parts
+    # ever see our modifications.
+    fields = dict(fields or {})
+    fields.pop(u'body', None)  # a body is never inherited
+
+    headers = {
+        u'alt-id': message[u'message-id'],
+        u'in-reply-to': message[u'in-reply-to'],
+        u'from': message[u'from'],
+        u'date': message[u'date'],
+        u'content-type': message.get_content_type(),
+        }
+    for tag, value in headers.items():
+        # Sub-parts rarely carry their own Message-ID/From/Date, so an
+        # absent header must not wipe out the inherited value.
+        if value is not None or tag not in fields:
+            fields[tag] = value
+    for tag, value in fields.items():
+        if value is not None and not isinstance(value, unicode):
+            fields[tag] = unicode(value, encoding=DEFAULT_ENCODING)
+
+    if message.is_multipart():
+        ret = []
+        alt_id = fields[u'alt-id']
+        # Recurse over the direct children only.  message.walk() also
+        # yields the children of nested multiparts, which the recursive
+        # call already handles, so using it would emit them twice.
+        for part in message.get_payload():
+            part_fields = dict(fields)
+            if len(ret) > 0:  # we've added one part already
+                # Only the first part may claim the message's alt-id;
+                # the others are threaded as replies to it.
+                part_fields.pop(u'alt-id', None)
+                if alt_id is not None:
+                    part_fields[u'in-reply-to'] = alt_id
+            ret.append(comment_message_to_xml(part, part_fields))
+        return u'\n'.join(ret)
+
+    # An unknown charset is not checked here; unicode() below raises
+    # LookupError for it.
+    charset = message.get_content_charset(DEFAULT_ENCODING).lower()
+
+    # decode=True undoes the Content-Transfer-Encoding (if any), so the
+    # header itself does not need to be inspected.
+    body = message.get_payload(decode=True)
+    assert body != None, "Unable to decode?"
+    if fields[u'content-type'].startswith(u"text/"):
+        body = unicode(body, encoding=charset).rstrip(u'\n')
+    else:
+        # base64.encode() works on file objects; encodestring() is the
+        # string-to-string variant.
+        body = base64.encodestring(body)
+    fields[u'body'] = body
+
+    lines = [u"<comment>"]
+    for tag, value in fields.items():
+        if value is not None:
+            lines.append(u" <%s>%s</%s>" % (tag, escape(value), tag))
+    lines.append(u"</comment>")
+    return u'\n'.join(lines)
+
+def main(mbox_filename):
+    """
+    Print the messages stored in the mbox file mbox_filename as a
+    single <comment-list> XML document on stdout.
+    """
+    xml_declaration = u'<?xml version="1.0" encoding="%s" ?>' \
+        % DEFAULT_ENCODING
+    print xml_declaration
+    print u"<comment-list>"
+    # Emit each message as soon as it has been converted.
+    for msg in mbox(mbox_filename):
+        print comment_message_to_xml(msg)
+    print u"</comment-list>"
+
+
+# Script entry point: the first command-line argument is passed to
+# main() as the mbox filename.
+if __name__ == "__main__":
+    import sys
+    main(sys.argv[1])