Update atomgen to use lxml instead of Amara and add Amara post.
authorW. Trevor King <wking@drexel.edu>
Sat, 20 Nov 2010 19:25:21 +0000 (14:25 -0500)
committerW. Trevor King <wking@drexel.edu>
Sat, 20 Nov 2010 19:25:21 +0000 (14:25 -0500)
posts/Amara.mdwn [new file with mode: 0644]
posts/Atomgen.mdwn
posts/Atomgen/atomgen.py

diff --git a/posts/Amara.mdwn b/posts/Amara.mdwn
new file mode 100644 (file)
index 0000000..81c8265
--- /dev/null
@@ -0,0 +1,36 @@
+At one point my [[Atomgen]] script used Amara, because I liked the
+bindery interface.  However, I was having [problems][] with wide
+Unicode characters, so I've updated Atomgen to use more widely
+supported XML libraries ([lxml][]'s [objectify][] API is similar to
+Amara's bindery).  For future reference, I'll recount my history with
+Amara.
+
+I started off using Sylvain Hellegouarch's [Atomixlib][], which is
+based on [Amara2][] or [ElementTree][].  However, Sylvain seems to be
+falling behind as ElementTree and Amara continue to evolve, so I
+rewrote Atomgen to run off [Amara2][] directly, since Amara's
+*bindery* interface is wonderful.  Things worked fine when I was
+installing Amara via `easy_install`:
+
+    sudo apt-get install python-setuptools
+    easy_install --prefix=~ amara
+
+but I ran into the [wide Unicode][] issues mentioned above when I
+tried to install Amara2 using Tiziano Müller's [dev-zero][]
+[overlay][] (Gentoo only packages Amara-1.2.0.2 by default).
+
+The Amara + [expat][] + wide Unicode issue is discussed in [Gentoo bug
+306655][].
+
+[problems]: http://trac.xml3k.org/ticket/69
+[lxml]: http://codespeak.net/lxml/
+[objectify]: http://codespeak.net/lxml/objectify.html
+[Atomixlib]: http://pypi.python.org/pypi/atomixlib/
+[Amara]: http://xml3k.org/Amara/
+[Amara2]: http://xml3k.org/Amara2/
+[ElementTree]: http://pypi.python.org/pypi/elementtree/
+[wide Unicode]: http://www.python.org/dev/peps/pep-0261/
+[dev-zero]: http://git.overlays.gentoo.org/gitweb/?p=dev/dev-zero.git
+[overlay]: http://www.gentoo.org/proj/en/overlays/userguide.xml
+[expat]: http://expat.sourceforge.net/
+[Gentoo bug 306655]: http://bugs.gentoo.org/show_bug.cgi?id=306655
index 169fa229fca844330d5422a44166278a58da2921..c37b9e377e66585a797953064df8ea0623f4f83e 100644 (file)
@@ -12,13 +12,7 @@ feed.  It works rather well I think, even if noone ends up actually
 looking at the feed ;).
 
 Anyhow, I wrote up a little command line wrapper ([[atomgen.py]])
-around Sylvain Hellegouarch's [Atomixlib][], which is based on
-[Amara2][] or [ElementTree][].  However, Sylvain seems to be falling
-behind as ElementTree and Amara continue to evolve, so I recently
-rewrote my script to run off [Amara2][] directly, since Amara's
-*bindery* interface is wonderful (well, for data manipulation anyway.
-Attributes, prefixes, and tree construction don't make much sense to
-me yet...).
+around [lxml][].
 
     atomgen -o atom.xml new --title 'Physics 201' --author 'W. Trevor King' \
       http://www.physics.drexel.edu/~wking/phys201
@@ -26,24 +20,10 @@ me yet...).
       atomgen -o atom.xml add -i atom.xml 'Feed purpose' \
       http://www.physics.drexel.edu/~wking/phys201
 
-If your distro does not package Amara, install it with
-
-    sudo apt-get install python-setuptools
-    easy_install --prefix=~ amara
-
-or use [[pip|Distributing Python]].  Gentoo packages Amara-1.2.0.2,
-but there are Amara2 ebuilds in Tiziano Müller's [dev-zero][]
-[overlay][].
-
 [Atom feeds]: http://en.wikipedia.org/wiki/Atom_%28standard%29
 [rss2email]: http://rss2email.infogami.com/
 [procmail]: http://www.procmail.org/
-[Atomixlib]: http://pypi.python.org/pypi/atomixlib/
-[Amara]: http://xml3k.org/Amara/
-[Amara2]: http://xml3k.org/Amara2/
-[ElementTree]: http://pypi.python.org/pypi/elementtree/
-[dev-zero]: http://git.overlays.gentoo.org/gitweb/?p=dev/dev-zero.git
-[overlay]: http://www.gentoo.org/proj/en/overlays/userguide.xml
+[lxml]: http://codespeak.net/lxml/
 
 [[!tag tags/blogging]]
 [[!tag tags/programming]]
index ce6b79fab470b9a1e5e19963e002d955f8e002dd..52793c297aeebc5ec6c908c9e2f63a2e7ef8536b 100755 (executable)
 
 """Produce `RFC 4287` compliant Atom 1.0 XML from the command line.
 
-Tested on Amara_ version 2.0a4.
-
 .. _RFC 4287: http://tools.ietf.org/html/rfc4287
-.. _Amara: http://wiki.xml3k.org/Amara2
 """
 
 from optparse import OptionParser
 import sys
 import time
 
-from amara import bindery as AB
-from amara.namespaces import ATOM_NAMESPACE, XML_NAMESPACE, XHTML_NAMESPACE
-
+from lxml import etree
+from lxml import objectify
+from lxml.html import XHTML_NAMESPACE
 
 __version__ = '0.2'
-GENERATOR_NAME = u'atomgen [based on amara2]'
-XML_WRITE_KWARGS = {'writer':'xml-indent'}
+GENERATOR_NAME = u'atomgen [based on lxml]'
+
+ATOM_NAMESPACE = 'http://www.w3.org/2005/Atom'
+
+ATOM = '{%s}' % ATOM_NAMESPACE
+XHTML = '{%s}' % XHTML_NAMESPACE
+
+NSMAP = {
+    None : ATOM_NAMESPACE,
+    'html': XHTML_NAMESPACE,
+    }
 
 
 def _id(url, time_published):
@@ -107,13 +113,13 @@ class NewFeedCommand (Command):
     """Create a new feed
 
     >>> c = NewFeedCommand()
-    >>> doc = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
+    >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
     ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
     ...     '--author-email', 'wking@drexel.edu',
     ...     'http://www.physics.drexel.edu/~wking/phys201'])
-    >>> doc.xml_write(**XML_WRITE_KWARGS)
-    ... # doctest: +ELLIPSIS, +REPORT_UDIFF
-    <?xml version="1.0" encoding="UTF-8"?>
+    >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
+    ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
+    <?xml version='1.0' encoding='UTF-8'?>
     <feed xmlns="http://www.w3.org/2005/Atom">
       <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
       <title>Physics 201</title>
@@ -122,9 +128,10 @@ class NewFeedCommand (Command):
         <email>wking@drexel.edu</email>
         <uri>http://www.physics.drexel.edu/~wking/</uri>
       </author>
-      <generator version="0.2">atomgen [based on amara2]</generator>
+      <generator version="0.2">atomgen [based on lxml]</generator>
       <updated>...</updated>
     </feed>
+    <BLANKLINE>
     """
     name = 'new'
 
@@ -146,47 +153,42 @@ class NewFeedCommand (Command):
         return parser
 
     def _run(self, options, args):
-        uri = unicode(args[0])
+        uri = args[0]
 
-        doc = AB.nodes.entity_base()
-        doc.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'feed'))
+        feed = objectify.Element(ATOM + 'feed', nsmap=NSMAP)
 
         tpub = time.time()
-        doc.feed.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'id'))
-        doc.feed.id = _id(uri, tpub)
-
-        if options.title != None:
-            doc.feed.xml_append(doc.xml_element_factory(
-                    ATOM_NAMESPACE, u'title'))
-            doc.feed.title = unicode(options.title)
-
-        if options.author != None:
-            doc.feed.xml_append(doc.xml_element_factory(
-                    ATOM_NAMESPACE, u'author'))
-            doc.feed.author.xml_append(doc.xml_element_factory(
-                    ATOM_NAMESPACE, u'name'))
-            doc.feed.author.name = unicode(options.author)
+        etree.SubElement(feed, ATOM + 'id')
+        feed.id = _id(uri, tpub)
+
+        if options.title:
+            etree.SubElement(feed, ATOM + 'title')
+            feed.title = options.title
+
+        if options.author or options.author_email or options.author_uri:
+            etree.SubElement(feed, ATOM + 'author')
+        if options.author:
+            etree.SubElement(feed.author, ATOM + 'name')
+            feed.author.name = options.author
         if options.author_email:
-            doc.feed.author.xml_append(
-                doc.xml_element_factory(ATOM_NAMESPACE, u'email'))
-            doc.feed.author.email = unicode(options.author_email)
+            etree.SubElement(feed.author, ATOM + 'email')
+            feed.author.email = options.author_email
         if options.author_uri:
-            doc.feed.author.xml_append(
-                doc.xml_element_factory(ATOM_NAMESPACE, u'uri'))
-            doc.feed.author.uri = unicode(options.author_uri)
+            etree.SubElement(feed.author, ATOM + 'uri')
+            feed.author.uri = options.author_uri
+
+        etree.SubElement(feed, ATOM + 'generator')
+        feed.generator = GENERATOR_NAME
+        feed.generator.attrib['version'] = __version__
 
-        doc.feed.xml_append(doc.xml_element_factory(
-                ATOM_NAMESPACE, u'generator'))
-        doc.feed.generator = u'%s' % GENERATOR_NAME
-        doc.feed.generator.xml_attributes.setnode(
-            doc.feed.generator.xml_attribute_factory(
-                ATOM_NAMESPACE, u'version', __version__))
+        etree.SubElement(feed, ATOM + 'updated')
+        feed.updated = _timestamp(tpub)
 
-        doc.feed.xml_append(doc.xml_element_factory(
-                ATOM_NAMESPACE, u'updated'))
-        doc.feed.updated = _timestamp(tpub)
+        # remove http://codespeak.net/lxml/objectify/pytype namespace
+        objectify.deannotate(feed)
+        etree.cleanup_namespaces(feed)
 
-        return doc
+        return feed
 
 
 class AddEntryCommand (Command):
@@ -199,24 +201,54 @@ class AddEntryCommand (Command):
     First, create a feed to edit.
 
     >>> c = NewFeedCommand()
-    >>> doc = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
+    >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
     ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
     ...     '--author-email', 'wking@drexel.edu',
     ...     'http://www.physics.drexel.edu/~wking/phys201'])
     >>> fd,path = mkstemp(suffix='.atom', prefix='atomgen-')
     >>> close(fd)
-    >>> with open(path, 'w') as f:
-    ...     doc.xml_write(stream=f, **XML_WRITE_KWARGS)
+    >>> root = etree.ElementTree(feed)
+    >>> root.write(path)
 
     Now add an entry to that feed.
 
     >>> c = AddEntryCommand()
     >>> stdin = sys.stdin
     >>> sys.stdin = StringIO('Changes will be noted in this feed.')
-    >>> doc = c.run(['--input', path, 'Feed purpose',
+    >>> feed = c.run(['--input', path, 'Feed purpose',
     ...     'http://www.physics.drexel.edu/~wking/phys201'])
     >>> sys.stdin = stdin
-    >>> doc.xml_write(**XML_WRITE_KWARGS)
+    >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
+    ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
+    <?xml version='1.0' encoding='UTF-8'?>
+    <feed xmlns="http://www.w3.org/2005/Atom">
+      <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
+      <title>Physics 201</title>
+      <author>
+        <name>W. Trevor King</name>
+        <email>wking@drexel.edu</email>
+        <uri>http://www.physics.drexel.edu/~wking/</uri>
+      </author>
+      <generator version="0.2">atomgen [based on lxml]</generator>
+      <updated>...</updated>
+      <entry>
+        <title>Feed purpose</title>
+        <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
+        <link href="http://www.physics.drexel.edu/~wking/phys201"/>
+        <published>...</published>
+        <updated>...</updated>
+        <content type="xhtml">
+          <html:div xmlns:html="http://www.w3.org/1999/xhtml">Changes will be noted in this feed.</html:div>
+        </content>
+      </entry>
+    </feed>
+    <BLANKLINE>
+
+    Note that we cannot move the html namespace declaration to the
+    `<feed>` start tag until there is a way to update namespace maps
+    on the fly.  See `lxml bug 555602`_.
+
+    .. _lxml bug 555602: https://bugs.launchpad.net/lxml/+bug/555602
 
     Cleanup.
 
@@ -245,65 +277,53 @@ class AddEntryCommand (Command):
         title = unicode(args[0])
         link = unicode(args[1])
 
+        parser = etree.XMLParser(remove_blank_text=True)
+
         if options.ifilename == None:
             assert options.content != None, (
                 'Need to use one of --input or --content')
-            doc = AB.parse(sys.stdin)
+            root = objectify.parse(sys.stdin, parser=parser)
         else:
-            doc = AB.parse(options.ifilename)
+            root = objectify.parse(options.ifilename, parser=parser)
+
+        feed = root.getroot()
 
         if options.content == None:
-            content = unicode(sys.stdin.read())
+            content = sys.stdin.read()
         else:
-            content = file(options.content, 'r').read().decode('utf-8')
-
-        # convert content out of unicode.  Avoids ?bug? in
-        # generator.ax_amara.construct_xhtml_text calls
-        # amara.bindery.xml_append_fragment which gives
-        #   ValueError: String must be of type string, not unicode
-        #content = str(content)
+            content = file(options.content, 'r').read()
 
-        new_entry = doc.xml_element_factory(ATOM_NAMESPACE, u'entry')
-        print >> sys.stderr, options.ifilename
-        print >> sys.stderr, open(options.ifilename, 'r').read()
-        print >> sys.stderr, dir(doc)
-        doc.feed.xml_append(new_entry)
-
-        new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'title'))
-        new_entry.title = title
+        entry = etree.SubElement(feed, ATOM + 'entry')
+        etree.SubElement(entry, ATOM + 'title')
+        entry.title = title
 
         tpub = time.time()
-        new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'id'))
-        new_entry.id = _id(link, tpub)
+        etree.SubElement(entry, ATOM + 'id')
+        entry.id = _id(link, tpub)
+
+        etree.SubElement(entry, ATOM + 'link')
+        entry.link.attrib['href'] = link
 
-        new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'link'))
-        new_entry.link.xml_attributes.setnode(
-            new_entry.link.xml_attribute_factory(
-                ATOM_NAMESPACE, u'href', link))
+        etree.SubElement(entry, ATOM + 'published')
+        entry.published = _timestamp(tpub)
 
-        new_entry.xml_append(doc.xml_element_factory(
-                ATOM_NAMESPACE, u'published'))
-        new_entry.published = _timestamp(tpub)
+        etree.SubElement(entry, ATOM + 'updated')
+        entry.updated = _timestamp(tpub)
 
-        new_entry.xml_append(doc.xml_element_factory(
-                ATOM_NAMESPACE, u'updated'))
-        new_entry.updated = _timestamp(tpub)
+        etree.SubElement(entry, ATOM + 'content')
+        entry.content.attrib['type'] = 'xhtml'
+        etree.SubElement(entry.content, XHTML + 'div')
+        entry.content[XHTML + 'div'] = content
 
-        new_entry.xml_append(doc.xml_element_factory(
-                ATOM_NAMESPACE, u'content'))
-        new_entry.content.xml_attributes.setnode(
-            new_entry.content.xml_attribute_factory(
-                ATOM_NAMESPACE, u'type', u'xhtml'))
-        new_entry.content.xml_append(doc.xml_element_factory(
-                XHTML_NAMESPACE, u'div'))
-        new_entry.content.div = content
+        if not hasattr(feed, u'updated') :
+            etree.SubElement(feed, ATOM + 'updated')
+        feed.updated = _timestamp(tpub)
 
-        if not hasattr(doc.feed, u'updated') :
-            doc.feed.xml_append(doc.xml_element_factory(
-                    ATOM_NAMESPACE, u'updated'))
-        doc.feed.updated = _timestamp(tpub)
+        # remove http://codespeak.net/lxml/objectify/pytype namespace
+        objectify.deannotate(feed)
+        etree.cleanup_namespaces(feed)
 
-        return doc
+        return feed
 
 
 def test():
@@ -335,12 +355,12 @@ if __name__ == "__main__" and True:
     command_name = args[0]
     command = command_dict[command_name]
     args = args[1:]
-    doc = command.run(args)
+    feed = command.run(args)
 
+    ostring = etree.tostring(
+            feed, pretty_print=True, xml_declaration=True, encoding='UTF-8')
     if options.ofilename == None:
-        doc.xml_write(**XML_WRITE_KWARGS)
-        print # add trailing endline
+        print ostring,
     else:
         with file(options.ofilename, 'w') as of:
-            doc.xml_write(stream=of, **XML_WRITE_KWARGS)
-            print >> of, '' # add trailing endline
+            of.write(ostring)