Added be-mbox-to-xml.
author W. Trevor King <wking@drexel.edu>
Sun, 12 Jul 2009 12:38:40 +0000 (08:38 -0400)
committer W. Trevor King <wking@drexel.edu>
Sun, 12 Jul 2009 12:38:40 +0000 (08:38 -0400)
Reworked to allow "be comment" to handle unicode strings (see bug
e4ed63f6-9000-4d0b-98c3-487269140141).  The solution was to escape all
the unicode to produce an ASCII string before calling
ElementTree.XML, and then convert back to unicode afterwards.

Added a unicode-containing comment to the end of bug
f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a so that there's a handy unicode
comment for testing.

XML headers (e.g. '<?xml version="1.0" encoding="UTF-8" ?>') are
now added to all xml output from be.

Switched non-text/* encoding library to base64 instead of
email.encoders, which makes that code in libbe/comment.py simpler.

Changed libbe/mapfile.py error encoding from string_escape to
unicode_escape so it can handle unicode.

Everything's still untested, and be-xml-to-mbox doesn't handle unicode
yet, but I felt this commit was getting a bit unwieldy ;).

21 files changed:
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values [new file with mode: 0644]
.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values [new file with mode: 0644]
.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/028d2e8d-5b0f-4c43-a913-35a1709b2276/values
.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/15602c0c-25e4-4c2c-9e24-79bdb90721b1/values
.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/3f556a48-c538-4569-8609-3e829b561d78/values
.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body [new file with mode: 0644]
.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values [new file with mode: 0644]
becommands/comment.py
becommands/show.py
libbe/comment.py
libbe/mapfile.py
xml/be-mbox-to-xml [new file with mode: 0755]

diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/body
new file mode 100644 (file)
index 0000000..0598d70
--- /dev/null
@@ -0,0 +1,8 @@
+<type 'unicode'> <body>ሴ</body>
+Traceback (most recent call last):
+  File "<string>", line 1, in <module>
+  File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 963, in XML
+    parser.feed(text)
+  File "/usr/lib/python2.5/xml/etree/ElementTree.py", line 1245, in feed
+    self._parser.Parse(data, 0)
+UnicodeEncodeError: 'ascii' codec can't encode character u'\u1234' in position 6: ordinal not in range(128)
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/07fc448f-c42e-4846-929a-8924de485766/values
new file mode 100644 (file)
index 0000000..cd8d8b9
--- /dev/null
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:34:22 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/body
new file mode 100644 (file)
index 0000000..397d4b6
--- /dev/null
@@ -0,0 +1 @@
+It looks like etree wants a byte string, not unicode input
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/520a9829-8d90-43ce-be64-868b8321e5b0/values
new file mode 100644 (file)
index 0000000..8bdaf52
--- /dev/null
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:42:16 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: faa686bf-c0eb-48bf-8a0b-d9a2e02bd132
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/body
new file mode 100644 (file)
index 0000000..ce2bb8d
--- /dev/null
@@ -0,0 +1,5 @@
+For example, this works:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a.encode("unicode_escape")); print type(b.text), unicode(b.text).decode("unicode_escape");'
+
+Ugly though :p.  Ah well.
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/8b54e56e-c693-4594-998f-5bd6c1f385d7/values
new file mode 100644 (file)
index 0000000..1784e0e
--- /dev/null
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:46:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 520a9829-8d90-43ce-be64-868b8321e5b0
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/body
new file mode 100644 (file)
index 0000000..89a8f8d
--- /dev/null
@@ -0,0 +1 @@
+That's with Python 2.5.2 and ElementTree "2326 2005-03-17 07:45:21Z fredrik"
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/bb124fd9-08f5-4f82-a035-6355e8403075/values
new file mode 100644 (file)
index 0000000..cca07c3
--- /dev/null
@@ -0,0 +1,11 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:37:55 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
+
+In-reply-to: 07fc448f-c42e-4846-929a-8924de485766
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/body
new file mode 100644 (file)
index 0000000..57e050d
--- /dev/null
@@ -0,0 +1,5 @@
+Isolated problem to:
+
+python -c 'from xml.etree import ElementTree; a=u"<body>\u1234</body>"; print type(a), a; b=ElementTree.XML(a);'
+
+Output attached below
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/comments/faa686bf-c0eb-48bf-8a0b-d9a2e02bd132/values
new file mode 100644 (file)
index 0000000..e430ea0
--- /dev/null
@@ -0,0 +1,8 @@
+Content-type: text/plain
+
+
+Date: Sun, 12 Jul 2009 11:31:13 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
diff --git a/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values b/.be/bugs/e4ed63f6-9000-4d0b-98c3-487269140141/values
new file mode 100644 (file)
index 0000000..4bc81f5
--- /dev/null
@@ -0,0 +1,17 @@
+creator: W. Trevor King <wking@drexel.edu>
+
+
+reporter: W. Trevor King <wking@drexel.edu>
+
+
+severity: minor
+
+
+status: fixed
+
+
+summary: utf8 problems in xml parsing
+
+
+time: Sat, 11 Jul 2009 15:48:32 +0000
+
index eb5631713afd540f6dffa87232985ed67ab9e774..d39c4a1160a956263302679bd7bafea1aa5f66ed 100644 (file)
@@ -1,21 +1,8 @@
+Content-type: text/plain
 
 
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 19:41:02 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 19:41:02 +0000
 
 
+From: W. Trevor King <wking@drexel.edu>
 
index f976972598828b808d900194f691a48454452903..639fd4ab31309d84ec44f005307cc7bb572539c6 100644 (file)
@@ -1,21 +1,8 @@
+Content-type: text/plain
 
 
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 02:36:16 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 02:36:16 +0000
 
 
+From: W. Trevor King <wking@drexel.edu>
 
index bf5085b86e5f5012bf5f7fbc54f5a9dab126daa2..2821b2ff19cab2bd9c87164df9fe779d6bf99f54 100644 (file)
@@ -1,21 +1,8 @@
+Content-type: text/plain
 
 
-
-Content-type=text/plain
-
-
-
-
-
-
-Date=Tue, 25 Nov 2008 03:02:59 +0000
-
-
-
-
-
-
-From=W. Trevor King <wking@drexel.edu>
+Date: Tue, 25 Nov 2008 03:02:59 +0000
 
 
+From: W. Trevor King <wking@drexel.edu>
 
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/body
new file mode 100644 (file)
index 0000000..b441da9
--- /dev/null
@@ -0,0 +1 @@
+Test unicode �quotes�
diff --git a/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values b/.be/bugs/f7ccd916-b5c7-4890-a2e3-8c8ace17ae3a/comments/f376debf-9f7e-4347-807f-00e7263487c7/values
new file mode 100644 (file)
index 0000000..a67680d
--- /dev/null
@@ -0,0 +1,8 @@
+Content-type: text/plain
+
+
+Date: Sat, 11 Jul 2009 18:28:57 +0000
+
+
+From: W. Trevor King <wking@drexel.edu>
+
index 1e6ecd4c0e3e74a46feaa0035eca2b4c068d7286..c4b074f0b10cb1bfd16f834c5fcee0c90a8aa6ab 100644 (file)
@@ -117,7 +117,8 @@ def execute(args, test=False):
             new.content_type = options.content_type
     else: # import XML comment [list]
         # read in the comments
-        comment_list = ElementTree.XML(body)
+        str_body = body.strip().encode("unicode_escape")
+        comment_list = ElementTree.XML(str_body)
         if comment_list.tag not in ["bug", "comment-list"]:
             raise comment.InvalidXML(
                 comment_list, "root element must be <bug> or <comment-list>")
@@ -130,7 +131,7 @@ def execute(args, test=False):
         for child in comment_list.getchildren():
             if child.tag == "comment":
                 new = comment.Comment(bug)
-                new.from_xml(ElementTree.tostring(child))
+                new.from_xml(unicode(ElementTree.tostring(child)).decode("unicode_escape"))
                 if new.alt_id in ids:
                     raise cmdutil.UserError(
                         "Clashing comment alt_id: %s" % new.alt_id)
index a4208c3565a174e533b14522dc113a3b28c5c4d8..f700caafe04b1bb7f436ba9b6b8d7367aa74953d 100644 (file)
@@ -40,6 +40,7 @@ def execute(args, test=False):
     Bug A
     <BLANKLINE>
     >>> execute (["--xml", "a"], test=True) # doctest: +ELLIPSIS
+    <?xml version="1.0" encoding="..." ?>
     <bug>
       <uuid>a</uuid>
       <short-name>a</short-name>
@@ -70,6 +71,7 @@ def execute(args, test=False):
         bug = bd.bug_from_shortname(bugname)
         if is_comment == False:
             if options.dumpXML:
+                print '<?xml version="1.0" encoding="%s" ?>' % bd.encoding
                 print bug.xml(show_comments=True)
             else:
                 print bug.string(show_comments=True)
index d4d47a812e8db8d3f4ea9f791456a87c20531f52..7acbbb10962af32008cff0375b41a2b04a2fc4e4 100644 (file)
 #    along with this program; if not, write to the Free Software
 #    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 #    MA 02110-1301, USA
-import email.mime.base, email.encoders
+import base64
 import os
 import os.path
 import time
+import types
 try: # import core module, Python >= 2.5
     from xml.etree import ElementTree
 except ImportError: # look for non-core module
@@ -80,10 +81,13 @@ def list_to_root(comments, bug, root=None):
     else:
         uuid_map[root.uuid] = root
     for comm in comments:
+        if comm.in_reply_to == INVALID_UUID:
+            comm.in_reply_to = None
         rep = comm.in_reply_to
         if rep == None or rep == bug.uuid:
             root_comments.append(comm)
         else:
+            print comm.in_reply_to
             parentUUID = comm.in_reply_to
             parent = uuid_map[parentUUID]
             parent.add_reply(comm)
@@ -269,7 +273,7 @@ class Comment(Tree, settings_object.SavedSettingsObject):
             msg = email.mime.base.MIMEBase(maintype, subtype)
             msg.set_payload(self.body or "")
             email.encoders.encode_base64(msg)
-            body = msg.as_string()
+            body = base64.encodestring(self.body or "")
         info = [("uuid", self.uuid),
                 ("alt-id", self.alt_id),
                 ("short-name", shortname),
@@ -310,11 +314,14 @@ class Comment(Tree, settings_object.SavedSettingsObject):
         >>> commA.From
         >>> commB.From
         """
+        if type(xml_string) == types.UnicodeType:
+            xml_string = xml_string.strip().encode("unicode_escape")
         comment = ElementTree.XML(xml_string)
         if comment.tag != "comment":
             raise InvalidXML(comment, "root element must be <comment>")
         tags=['uuid','alt-id','in-reply-to','from','date','content-type','body']
         uuid = None
+        body = None
         for child in comment.getchildren():
             if child.tag == "short-name":
                 pass
@@ -322,24 +329,31 @@ class Comment(Tree, settings_object.SavedSettingsObject):
                 if child.text == None or len(child.text) == 0:
                     text = settings_object.EMPTY
                 else:
-                    text = xml.sax.saxutils.unescape(child.text.strip())
+                    text = xml.sax.saxutils.unescape(child.text)
+                    text = unicode(text).decode("unicode_escape").strip()
                 if child.tag == "uuid":
                     uuid = text
                     continue # don't set the bug's uuid tag.
+                if child.tag == "body":
+                    body = text
+                    continue # don't set the bug's body yet.
                 elif child.tag == 'from':
                     attr_name = "From"
                 elif child.tag == 'date':
                     attr_name = 'time_string'
                 else:
                     attr_name = child.tag.replace('-','_')
-                if attr_name == "body":
-                    text += '\n' # replace strip()ed trailing newline
                 setattr(self, attr_name, text)
             elif verbose == True:
                 print >> sys.stderr, "Ignoring unknown tag %s in %s" \
                     % (child.tag, comment.tag)
         if self.alt_id == None and uuid not in [None, self.uuid]:
             self.alt_id = uuid
+        if body != None:
+            if self.content_type.startswith("text/"):
+                self.body = body
+            else:
+                self.body = base64.decodestring(body)
 
     def string(self, indent=0, shortname=None):
         """
index 40386e2cb59e50e169f1ef19b47f0d24464bda8c..b183bfed36db768c64c793aef2f889fcb11c9d51 100644 (file)
@@ -67,9 +67,9 @@ def generate(map):
             assert(':' not in key)
             assert(len(key) > 0)
         except AssertionError:
-            raise IllegalKey(key.encode('string_escape'))
+            raise IllegalKey(unicode(key).encode('unicode_escape'))
         if "\n" in map[key]:
-            raise IllegalValue(map[key].encode('string_escape'))
+            raise IllegalValue(unicode(map[key]).encode('unicode_escape'))
 
     lines = []
     for key in keys:
diff --git a/xml/be-mbox-to-xml b/xml/be-mbox-to-xml
new file mode 100755 (executable)
index 0000000..e9077b1
--- /dev/null
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# Copyright (C) 2009 W. Trevor King <wking@drexel.edu>
+#
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""
+Convert an mbox into xml suitable for input into be.
+  $ be-mbox-to-xml mbox | be comment --xml <ID> -
+mbox is a flat-file format, consisting of a series of messages.
+Messages begin with a From_ line, followed by RFC 822 email,
+followed by a blank line.
+"""
+
+from mailbox import mbox, Message  # the mailbox people really want an on-disk copy
+import email.utils
+import types
+
+import base64
+from libbe.encoding import get_encoding, set_IO_stream_encodings
+from time import asctime, gmtime
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+from xml.sax.saxutils import escape
+
+# Encoding used below to decode header/body text that has no declared
+# charset, and named in the XML declaration that main() prints.
+DEFAULT_ENCODING = get_encoding()
+# NOTE(review): presumably switches stdin/stdout to this encoding so the
+# unicode print statements work -- confirm in libbe.encoding.
+set_IO_stream_encodings(DEFAULT_ENCODING)
+
+def comment_message_to_xml(message, fields=None):
+    """Return a unicode <comment> XML fragment for an email message.
+
+    message -- an email Message (e.g. one item yielded by mailbox.mbox).
+    fields  -- optional dict of tag -> text defaults, used for tags
+               whose header is missing from message.  It is copied,
+               never modified.
+
+    A multipart message produces one <comment> per direct sub-part,
+    joined by newlines: the first keeps the message-id as its alt-id and
+    the others are marked as replies to it.  text/* bodies are decoded
+    with the part's charset; any other body is base64 encoded.
+    """
+    # Work on a private copy so neither the caller's dict nor the dict
+    # shared between sibling sub-parts is clobbered.
+    fields = dict(fields or {})
+    headers = [(u'alt-id', u'message-id'), (u'in-reply-to', u'in-reply-to'),
+               (u'from', u'from'), (u'date', u'date')]
+    for tag, header in headers:
+        value = message[header]
+        # Sub-parts of a multipart message lack these headers, so only
+        # override an inherited value when the header is present.
+        if value is not None or tag not in fields:
+            fields[tag] = value
+    fields[u'content-type'] = message.get_content_type()
+    for k, v in fields.items():
+        if v is not None and not isinstance(v, unicode):
+            fields[k] = unicode(v, encoding=DEFAULT_ENCODING)
+
+    if message.is_multipart():
+        ret = []
+        alt_id = fields[u'alt-id']
+        # get_payload() lists only the direct children; nested multiparts
+        # are expanded by the recursive call, so no part is emitted twice.
+        for m in message.get_payload():
+            if ret:  # every part after the first replies to the first
+                fields.pop(u'alt-id', None)
+                fields[u'in-reply-to'] = alt_id
+            ret.append(comment_message_to_xml(m, fields))
+        return u'\n'.join(ret)
+
+    charset = message.get_content_charset(DEFAULT_ENCODING).lower()
+    # decode=True undoes any Content-Transfer-Encoding (absent is fine)
+    body = message.get_payload(decode=True)
+    assert body is not None, "Unable to decode?"
+    if fields[u'content-type'].startswith(u"text/"):
+        body = unicode(body, encoding=charset).rstrip(u'\n')
+    else:
+        # base64.encode() is the file-to-file API; encodestring() is the
+        # string one, and its output is plain ASCII.
+        body = unicode(base64.encodestring(body), "ascii")
+    fields[u'body'] = body
+    lines = [u"<comment>"]
+    for tag, text in fields.items():
+        if text is not None:
+            lines.append(u"  <%s>%s</%s>" % (tag, escape(text), tag))
+    lines.append(u"</comment>")
+    return u'\n'.join(lines)
+
+def main(mbox_filename):
+    """Print the mbox file at mbox_filename to stdout as XML.
+
+    The output is an XML declaration followed by a <comment-list>
+    holding the comment_message_to_xml() fragment for each message.
+    Raises mailbox.NoSuchMailboxError if the file does not exist.
+    """
+    # create=False: the default would silently create a missing mbox
+    mb = mbox(mbox_filename, create=False)
+    try:
+        print u'<?xml version="1.0" encoding="%s" ?>' % DEFAULT_ENCODING
+        print u"<comment-list>"
+        for message in mb:
+            print comment_message_to_xml(message)
+        print u"</comment-list>"
+    finally:
+        mb.close()
+
+
+if __name__ == "__main__":
+    import sys
+    # The mbox is opened by path, so exactly one filename argument is
+    # required; report that instead of failing with an IndexError.
+    if len(sys.argv) != 2:
+        sys.exit("usage: %s MBOX_FILE" % sys.argv[0])
+    main(sys.argv[1])