Reported bug with utf-8 strings
[be.git] / misc / xml / be-xml-to-mbox
1 #!/usr/bin/env python
2 # Copyright (C) 2009-2010 W. Trevor King <wking@drexel.edu>
3 #
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License along
15 # with this program; if not, write to the Free Software Foundation, Inc.,
16 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 """
18 Convert xml output of `be list --xml` into mbox format for browsing
19 with a mail reader.  For example
20   $ be list --xml --status=all | be-xml-to-mbox | catmutt
21
22 mbox is a flat-file format, consisting of a series of messages.
23 Messages begin with a From_ line, followed by an RFC 822 email,
24 followed by a blank line.
25 """
26
27 #from mailbox import mbox, Message  # the mailbox people really want an on-disk copy
28 import email.utils
29 from libbe.util.encoding import get_output_encoding
30 from libbe.util.utility import str_to_time as rfc2822_to_gmtime_integer
31 from time import asctime, gmtime
32 import types
33 try: # import core module, Python >= 2.5
34     from xml.etree import ElementTree
35 except ImportError: # look for non-core module
36     from elementtree import ElementTree
37 from xml.sax.saxutils import unescape
38
39
# Domain appended to ids when building Message-id style headers.
DEFAULT_DOMAIN = "invalid.com"
# Placeholder address on DEFAULT_DOMAIN (not referenced by the code
# visible in this script).
DEFAULT_EMAIL = "dummy@" + DEFAULT_DOMAIN
# Encoding used to wrap stdin/stdout and reported as the charset of
# text messages.
DEFAULT_ENCODING = get_output_encoding()
43
def rfc2822_to_asctime(rfc2822_string):
    """Convert an RFC 2822-formatted date string into an asctime string.

    An empty string is treated as the Unix epoch (0 seconds).

    >>> rfc2822_to_asctime("Thu, 01 Jan 1970 00:00:00 +0000")
    'Thu Jan  1 00:00:00 1970'
    """
    seconds = 0
    if rfc2822_string != "":
        seconds = rfc2822_to_gmtime_integer(rfc2822_string)
    return asctime(gmtime(seconds))
52
class LimitedAttrDict (dict):
    """
    Dict with error checking, to avoid invalid bug/comment fields.

    Subclasses list the permitted keys in `_attrs`.  Keys are checked
    when passed to the constructor and on item assignment: a key that
    is not listed raises TypeError if it is not a string, or
    ValueError if it is a string.  Only those two paths validate;
    other inherited dict mutators are not overridden here.
    """
    _attrs = [] # override with list of valid attribute names

    def __init__(self, **kwargs):
        dict.__init__(self)
        # Assign one item at a time so every key goes through __setitem__.
        for key,value in kwargs.items():
            self[key] = value

    def __setitem__(self, key, item):
        self._validate_key(key)
        dict.__setitem__(self, key, item)

    def _validate_key(self, key):
        """Return silently if `key` is listed in `_attrs`, else raise."""
        if key in self._attrs:
            return
        # types.StringTypes only exists on Python 2; fall back to str
        # so the check itself does not fail elsewhere.
        string_types = getattr(types, "StringTypes", (str,))
        # isinstance (rather than an exact type match) so that string
        # subclasses are reported as bad names, not bad types.
        if not isinstance(key, string_types):
            raise TypeError("Invalid attribute type %s for '%s'" % (type(key), key))
        raise ValueError("Invalid attribute name '%s'" % key)
72
class Bug (LimitedAttrDict):
    """
    One bug read from a `<bug>` XML element, printable as mbox messages.
    """
    _attrs = [u"uuid",
              u"short-name",
              u"severity",
              u"status",
              u"assigned",
              u"reporter",
              u"creator",
              u"created",
              u"summary",
              u"comments",
              u"extra-strings"]

    def print_to_mbox(self):
        """Print this bug, followed by each of its comments, as mbox.

        The bug's own message is only printed when a "creator" field
        is present; any comments are printed either way.
        """
        if "creator" in self:
            # otherwise, probably a `be show` uuid-only bug to avoid
            # root comments.
            addr = email.utils.parseaddr(self["creator"])[1]
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2.
            print("From %s %s" % (addr, rfc2822_to_asctime(self["created"])))
            print("Message-id: <%s@%s>" % (self["uuid"], DEFAULT_DOMAIN))
            print("Date: %s" % self["created"])
            print("From: %s" % self["creator"])
            print("Content-Type: %s; charset=%s"
                  % ("text/plain", DEFAULT_ENCODING))
            print("Content-Transfer-Encoding: 8bit")
            print("Subject: %s: %s" % (self["short-name"], self["summary"]))
            if "extra-strings" in self:
                for estr in self["extra-strings"]:
                    print("X-Extra-String: %s" % estr)
            print("")
            print(self["summary"])
            print("")
        if "comments" in self:
            for comment in self["comments"]:
                comment.print_to_mbox(self)

    def init_from_etree(self, element):
        """Fill in fields from the children of a `<bug>` element.

        `<comment>` children become Comment instances collected under
        "comments", `<extra-string>` children are collected under
        "extra-strings", and any other child is stored under its own
        tag name (which must be listed in `_attrs`).
        """
        assert element.tag == "bug", element.tag
        # Iterate the element directly; getchildren() is deprecated.
        for field in element:
            # An empty element has text None; use "" instead of letting
            # unicode(None) produce the literal string "None".
            raw = field.text
            if raw is None:
                raw = u""
            # Undo the unicode_escape encoding that the __main__ block
            # applies to the XML before parsing.
            text = unescape(unicode(raw).decode("unicode_escape").strip())
            if field.tag == "comment":
                comm = Comment()
                comm.init_from_etree(field)
                if "comments" in self:
                    self["comments"].append(comm)
                else:
                    self["comments"] = [comm]
            elif field.tag == "extra-string":
                if "extra-strings" in self:
                    self["extra-strings"].append(text)
                else:
                    self["extra-strings"] = [text]
            else:
                self[field.tag] = text
125
def wrap_id(id):
    """Return `id` in a form usable as a message id.

    An id that already contains "@" is returned unchanged; any other
    id is wrapped as "<id@DEFAULT_DOMAIN>".
    """
    if "@" in id:
        return id
    return "<%s@%s>" % (id, DEFAULT_DOMAIN)
130
class Comment (LimitedAttrDict):
    """
    One comment read from a `<comment>` XML element, printable as an
    mbox message.
    """
    _attrs = [u"uuid",
              u"alt-id",
              u"short-name",
              u"in-reply-to",
              u"author",
              u"date",
              u"content-type",
              u"body",
              u"extra-strings"]

    def print_to_mbox(self, bug=None):
        """Print this comment as a single mbox message.

        `bug` is the Bug the comment belongs to; it supplies the
        subject and, when the comment has no "in-reply-to" field, the
        id being replied to.  Without a bug, a stand-in with uuid
        "no-uuid" is used.  Requires the "author", "date",
        "content-type" and "body" fields to be set.
        """
        if bug is None:
            bug = Bug()
            bug[u"uuid"] = u"no-uuid"
        addr = email.utils.parseaddr(self["author"])[1]
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2.
        print("From %s %s" % (addr, rfc2822_to_asctime(self["date"])))
        # Prefer the uuid as the message id, falling back to the alt-id.
        if "uuid" in self:
            msg_id = self["uuid"]
        elif "alt-id" in self:
            msg_id = self["alt-id"]
        else:
            msg_id = None
        if msg_id is not None:
            print("Message-id: %s" % wrap_id(msg_id))
        if "alt-id" in self:
            print("Alt-id: %s" % wrap_id(self["alt-id"]))
        print("Date: %s" % self["date"])
        print("From: %s" % self["author"])
        subject = ""
        if "short-name" in self:
            subject += self["short-name"]+u": "
        if "summary" in bug:
            subject += bug["summary"]
        else:
            subject += u"no-subject"
        print("Subject: %s" % subject)
        # A comment with no explicit parent is treated as a reply to the
        # bug itself; the value is stored on the comment.
        if "in-reply-to" not in self:
            self["in-reply-to"] = bug["uuid"]
        print("In-Reply-To: %s" % wrap_id(self["in-reply-to"]))
        if "extra-strings" in self:
            for estr in self["extra-strings"]:
                print("X-Extra-String: %s" % estr)
        if self["content-type"].startswith("text/"):
            print("Content-Transfer-Encoding: 8bit")
            print("Content-Type: %s; charset=%s"
                  % (self["content-type"], DEFAULT_ENCODING))
        else:
            # The body is printed as-is; presumably it is already
            # base64 in the XML -- TODO confirm against `be` output.
            print("Content-Transfer-Encoding: base64")
            print("Content-Type: %s;" % (self["content-type"]))
        print("")
        print(self["body"])
        print("")

    def init_from_etree(self, element):
        """Fill in fields from the children of a `<comment>` element.

        `<extra-string>` children are collected under "extra-strings";
        any other child is stored under its own tag name (which must
        be listed in `_attrs`).  The body gets a trailing newline.
        """
        assert element.tag == "comment", element.tag
        # Iterate the element directly; getchildren() is deprecated.
        for field in element:
            # An empty element has text None; use "" instead of letting
            # unicode(None) produce the literal string "None".
            raw = field.text
            if raw is None:
                raw = u""
            # Undo the unicode_escape encoding that the __main__ block
            # applies to the XML before parsing.
            text = unescape(unicode(raw).decode("unicode_escape").strip())
            if field.tag == "extra-string":
                if "extra-strings" in self:
                    self["extra-strings"].append(text)
                else:
                    self["extra-strings"] = [text]
            else:
                if field.tag == "body":
                    text+="\n"
                self[field.tag] = text
193
def print_to_mbox(element):
    """Print the mbox form of a parsed XML element.

    `<bug>` and `<comment>` elements are converted and printed, and a
    `<be-xml>` container is walked recursively.  Elements with any
    other tag are silently ignored.
    """
    if element.tag == "bug":
        b = Bug()
        b.init_from_etree(element)
        b.print_to_mbox()
    elif element.tag == "comment":
        c = Comment()
        c.init_from_etree(element)
        c.print_to_mbox()
    elif element.tag == "be-xml":
        # Iterate the element directly; getchildren() is deprecated.
        for elt in element:
            print_to_mbox(elt)
206
if __name__ == "__main__":
    # Read `be` XML from the file named on the command line (or from
    # stdin) and print it to stdout in mbox format.
    import codecs
    import sys

    sys.stdin = codecs.getreader(DEFAULT_ENCODING)(sys.stdin)
    sys.stdout = codecs.getwriter(DEFAULT_ENCODING)(sys.stdout)

    if len(sys.argv) == 1: # no filename given, use stdin
        xml_unicode = sys.stdin.read()
    else:
        xml_file = codecs.open(sys.argv[1], "r", DEFAULT_ENCODING)
        # Close the file explicitly instead of leaking the handle.
        try:
            xml_unicode = xml_file.read()
        finally:
            xml_file.close()
    # Escape non-ASCII text so the parser only sees ASCII; the
    # init_from_etree methods decode it again.  Escape one line at a
    # time so real newlines are kept.  Escaping the whole string and
    # then replacing the two characters backslash + "n" would also hit
    # the tail of an escaped literal backslash followed by "n"
    # (e.g. "C:\new") and corrupt that text.
    xml_str = "\n".join(
        [line.encode("unicode_escape") for line in xml_unicode.split("\n")])
    elist = ElementTree.XML(xml_str)
    print_to_mbox(elist)