From b74959d2aae9f98a549150b32aa95cc4902c39df Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sat, 20 Nov 2010 14:25:21 -0500 Subject: [PATCH] Update atomgen to use lxml instead of Amara and add Amara post. --- posts/Amara.mdwn | 36 +++++++ posts/Atomgen.mdwn | 24 +---- posts/Atomgen/atomgen.py | 218 +++++++++++++++++++++------------------ 3 files changed, 157 insertions(+), 121 deletions(-) create mode 100644 posts/Amara.mdwn diff --git a/posts/Amara.mdwn b/posts/Amara.mdwn new file mode 100644 index 0000000..81c8265 --- /dev/null +++ b/posts/Amara.mdwn @@ -0,0 +1,36 @@ +At one point my [[Atomgen]] script used Amara, because I liked the +bindery interface. However, I was having [problems][] with wide +Unicode characters, so I've updated Atomgen to use more widely +supported XML libraries ([lxml][]'s [objectify][] API is similar to +Amara's bindery). For future reference, I'll recount my history with +Amara. + +I started off using Sylvain Hellegouarch's [Atomixlib][], which is +based on [Amara2][] or [ElementTree][]. However, Sylvain seems to be +falling behind as ElementTree and Amara continue to evolve, so I +rewrote Atomgen to run off [Amara2][] directly, since Amara's +*bindery* interface is wonderful. Things worked fine when I was +installing Amara via `easy_install`: + + sudo apt-get install python-setuptools + easy_install --prefix=~ amara + +but I ran into the [wide Unicode][] issues mentioned above when I +tried to install Amara2 using Tiziano Müller's [dev-zero][] +[overlay][] (Gentoo only packages Amara-1.2.0.2 by default). + +The Amara + [expat][] + wide Unicode issue is discussed in [Gentoo bug +306655][]. 
+ +[problems]: http://trac.xml3k.org/ticket/69 +[lxml]: http://codespeak.net/lxml/ +[objectify]: http://codespeak.net/lxml/objectify.html +[Atomixlib]: http://pypi.python.org/pypi/atomixlib/ +[Amara]: http://xml3k.org/Amara/ +[Amara2]: http://xml3k.org/Amara2/ +[ElementTree]: http://pypi.python.org/pypi/elementtree/ +[wide Unicode]: http://www.python.org/dev/peps/pep-0261/ +[dev-zero]: http://git.overlays.gentoo.org/gitweb/?p=dev/dev-zero.git +[overlay]: http://www.gentoo.org/proj/en/overlays/userguide.xml +[expat]: http://expat.sourceforge.net/ +[Gentoo bug 306655]: http://bugs.gentoo.org/show_bug.cgi?id=306655 diff --git a/posts/Atomgen.mdwn b/posts/Atomgen.mdwn index 169fa22..c37b9e3 100644 --- a/posts/Atomgen.mdwn +++ b/posts/Atomgen.mdwn @@ -12,13 +12,7 @@ feed. It works rather well I think, even if noone ends up actually looking at the feed ;). Anyhow, I wrote up a little command line wrapper ([[atomgen.py]]) -around Sylvain Hellegouarch's [Atomixlib][], which is based on -[Amara2][] or [ElementTree][]. However, Sylvain seems to be falling -behind as ElementTree and Amara continue to evolve, so I recently -rewrote my script to run off [Amara2][] directly, since Amara's -*bindery* interface is wonderful (well, for data manipulation anyway. -Attributes, prefixes, and tree construction don't make much sense to -me yet...). +around [lxml][]. atomgen -o atom.xml new --title 'Physics 201' --author 'W. Trevor King' \ http://www.physics.drexel.edu/~wking/phys201 @@ -26,24 +20,10 @@ me yet...). atomgen -o atom.xml add -i atom.xml 'Feed purpose' \ http://www.physics.drexel.edu/~wking/phys201 -If your distro does not package Amara, install it with - - sudo apt-get install python-setuptools - easy_install --prefix=~ amara - -or use [[pip|Distributing Python]]. Gentoo packages Amara-1.2.0.2, -but there are Amara2 ebuilds in Tiziano Müller's [dev-zero][] -[overlay][]. 
- [Atom feeds]: http://en.wikipedia.org/wiki/Atom_%28standard%29 [rss2email]: http://rss2email.infogami.com/ [procmail]: http://www.procmail.org/ -[Atomixlib]: http://pypi.python.org/pypi/atomixlib/ -[Amara]: http://xml3k.org/Amara/ -[Amara2]: http://xml3k.org/Amara2/ -[ElementTree]: http://pypi.python.org/pypi/elementtree/ -[dev-zero]: http://git.overlays.gentoo.org/gitweb/?p=dev/dev-zero.git -[overlay]: http://www.gentoo.org/proj/en/overlays/userguide.xml +[lxml]: http://codespeak.net/lxml/ [[!tag tags/blogging]] [[!tag tags/programming]] diff --git a/posts/Atomgen/atomgen.py b/posts/Atomgen/atomgen.py index ce6b79f..52793c2 100755 --- a/posts/Atomgen/atomgen.py +++ b/posts/Atomgen/atomgen.py @@ -17,23 +17,29 @@ """Produce `RFC 4287` compliant Atom 1.0 XML from the command line. -Tested on Amara_ version 2.0a4. - .. _RFC 4287: http://tools.ietf.org/html/rfc4287 -.. _Amara: http://wiki.xml3k.org/Amara2 """ from optparse import OptionParser import sys import time -from amara import bindery as AB -from amara.namespaces import ATOM_NAMESPACE, XML_NAMESPACE, XHTML_NAMESPACE - +from lxml import etree +from lxml import objectify +from lxml.html import XHTML_NAMESPACE __version__ = '0.2' -GENERATOR_NAME = u'atomgen [based on amara2]' -XML_WRITE_KWARGS = {'writer':'xml-indent'} +GENERATOR_NAME = u'atomgen [based on lxml]' + +ATOM_NAMESPACE = 'http://www.w3.org/2005/Atom' + +ATOM = '{%s}' % ATOM_NAMESPACE +XHTML = '{%s}' % XHTML_NAMESPACE + +NSMAP = { + None : ATOM_NAMESPACE, + 'html': XHTML_NAMESPACE, + } def _id(url, time_published): @@ -107,13 +113,13 @@ class NewFeedCommand (Command): """Create a new feed >>> c = NewFeedCommand() - >>> doc = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King', + >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King', ... '--author-uri', 'http://www.physics.drexel.edu/~wking/', ... '--author-email', 'wking@drexel.edu', ... 
'http://www.physics.drexel.edu/~wking/phys201']) - >>> doc.xml_write(**XML_WRITE_KWARGS) - ... # doctest: +ELLIPSIS, +REPORT_UDIFF - + >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True, + ... encoding='UTF-8') # doctest: +ELLIPSIS, +REPORT_UDIFF + tag:www.physics.drexel.edu,...:/~wking/phys201 Physics 201 @@ -122,9 +128,10 @@ class NewFeedCommand (Command): wking@drexel.edu http://www.physics.drexel.edu/~wking/ - atomgen [based on amara2] + atomgen [based on lxml] ... + """ name = 'new' @@ -146,47 +153,42 @@ class NewFeedCommand (Command): return parser def _run(self, options, args): - uri = unicode(args[0]) + uri = args[0] - doc = AB.nodes.entity_base() - doc.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'feed')) + feed = objectify.Element(ATOM + 'feed', nsmap=NSMAP) tpub = time.time() - doc.feed.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'id')) - doc.feed.id = _id(uri, tpub) - - if options.title != None: - doc.feed.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'title')) - doc.feed.title = unicode(options.title) - - if options.author != None: - doc.feed.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'author')) - doc.feed.author.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'name')) - doc.feed.author.name = unicode(options.author) + etree.SubElement(feed, ATOM + 'id') + feed.id = _id(uri, tpub) + + if options.title: + etree.SubElement(feed, ATOM + 'title') + feed.title = options.title + + if options.author or options.author_email or options.author_uri: + etree.SubElement(feed, ATOM + 'author') + if options.author: + etree.SubElement(feed.author, ATOM + 'name') + feed.author.name = options.author if options.author_email: - doc.feed.author.xml_append( - doc.xml_element_factory(ATOM_NAMESPACE, u'email')) - doc.feed.author.email = unicode(options.author_email) + etree.SubElement(feed.author, ATOM + 'email') + feed.author.email = options.author_email if options.author_uri: - doc.feed.author.xml_append( - 
doc.xml_element_factory(ATOM_NAMESPACE, u'uri')) - doc.feed.author.uri = unicode(options.author_uri) + etree.SubElement(feed.author, ATOM + 'uri') + feed.author.uri = options.author_uri + + etree.SubElement(feed, ATOM + 'generator') + feed.generator = GENERATOR_NAME + feed.generator.attrib['version'] = __version__ - doc.feed.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'generator')) - doc.feed.generator = u'%s' % GENERATOR_NAME - doc.feed.generator.xml_attributes.setnode( - doc.feed.generator.xml_attribute_factory( - ATOM_NAMESPACE, u'version', __version__)) + etree.SubElement(feed, ATOM + 'updated') + feed.updated = _timestamp(tpub) - doc.feed.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'updated')) - doc.feed.updated = _timestamp(tpub) + # remove http://codespeak.net/lxml/objectify/pytype namespace + objectify.deannotate(feed) + etree.cleanup_namespaces(feed) - return doc + return feed class AddEntryCommand (Command): @@ -199,24 +201,54 @@ class AddEntryCommand (Command): First, create a feed to edit. >>> c = NewFeedCommand() - >>> doc = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King', + >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King', ... '--author-uri', 'http://www.physics.drexel.edu/~wking/', ... '--author-email', 'wking@drexel.edu', ... 'http://www.physics.drexel.edu/~wking/phys201']) >>> fd,path = mkstemp(suffix='.atom', prefix='atomgen-') >>> close(fd) - >>> with open(path, 'w') as f: - ... doc.xml_write(stream=f, **XML_WRITE_KWARGS) + >>> root = etree.ElementTree(feed) + >>> root.write(path) Now add an entry to that feed. >>> c = AddEntryCommand() >>> stdin = sys.stdin >>> sys.stdin = StringIO('Changes will be noted in this feed.') - >>> doc = c.run(['--input', path, 'Feed purpose', + >>> feed = c.run(['--input', path, 'Feed purpose', ... 
'http://www.physics.drexel.edu/~wking/phys201']) >>> sys.stdin = stdin - >>> doc.xml_write(**XML_WRITE_KWARGS) + >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True, + ... encoding='UTF-8') # doctest: +ELLIPSIS, +REPORT_UDIFF + + + tag:www.physics.drexel.edu,...:/~wking/phys201 + Physics 201 + + W. Trevor King + wking@drexel.edu + http://www.physics.drexel.edu/~wking/ + + atomgen [based on lxml] + ... + + Feed purpose + tag:www.physics.drexel.edu,...:/~wking/phys201 + + ... + ... + + Changes will be noted in this feed. + + + + + + Note that we cannot move the html namespace declaration to the + `` start tag until there is a way to update namespace maps + on the fly. See `lxml bug 555602`_. + + .. _lxml bug 555602: https://bugs.launchpad.net/lxml/+bug/555602 Cleanup. @@ -245,65 +277,53 @@ class AddEntryCommand (Command): title = unicode(args[0]) link = unicode(args[1]) + parser = etree.XMLParser(remove_blank_text=True) + if options.ifilename == None: assert options.content != None, ( 'Need to use one of --input or --content') - doc = AB.parse(sys.stdin) + root = objectify.parse(sys.stdin, parser=parser) else: - doc = AB.parse(options.ifilename) + root = objectify.parse(options.ifilename, parser=parser) + + feed = root.getroot() if options.content == None: - content = unicode(sys.stdin.read()) + content = sys.stdin.read() else: - content = file(options.content, 'r').read().decode('utf-8') - - # convert content out of unicode. Avoids ?bug? 
in - # generator.ax_amara.construct_xhtml_text calls - # amara.bindery.xml_append_fragment which gives - # ValueError: String must be of type string, not unicode - #content = str(content) + content = file(options.content, 'r').read() - new_entry = doc.xml_element_factory(ATOM_NAMESPACE, u'entry') - print >> sys.stderr, options.ifilename - print >> sys.stderr, open(options.ifilename, 'r').read() - print >> sys.stderr, dir(doc) - doc.feed.xml_append(new_entry) - - new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'title')) - new_entry.title = title + entry = etree.SubElement(feed, ATOM + 'entry') + etree.SubElement(entry, ATOM + 'title') + entry.title = title tpub = time.time() - new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'id')) - new_entry.id = _id(link, tpub) + etree.SubElement(entry, ATOM + 'id') + entry.id = _id(link, tpub) + + etree.SubElement(entry, ATOM + 'link') + entry.link.attrib['href'] = link - new_entry.xml_append(doc.xml_element_factory(ATOM_NAMESPACE, u'link')) - new_entry.link.xml_attributes.setnode( - new_entry.link.xml_attribute_factory( - ATOM_NAMESPACE, u'href', link)) + etree.SubElement(entry, ATOM + 'published') + entry.published = _timestamp(tpub) - new_entry.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'published')) - new_entry.published = _timestamp(tpub) + etree.SubElement(entry, ATOM + 'updated') + entry.updated = _timestamp(tpub) - new_entry.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'updated')) - new_entry.updated = _timestamp(tpub) + etree.SubElement(entry, ATOM + 'content') + entry.content.attrib['type'] = 'xhtml' + etree.SubElement(entry.content, XHTML + 'div') + entry.content[XHTML + 'div'] = content - new_entry.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'content')) - new_entry.content.xml_attributes.setnode( - new_entry.content.xml_attribute_factory( - ATOM_NAMESPACE, u'type', u'xhtml')) - new_entry.content.xml_append(doc.xml_element_factory( - XHTML_NAMESPACE, u'div')) 
- new_entry.content.div = content + if not hasattr(feed, u'updated') : + etree.SubElement(feed, ATOM + 'updated') + feed.updated = _timestamp(tpub) - if not hasattr(doc.feed, u'updated') : - doc.feed.xml_append(doc.xml_element_factory( - ATOM_NAMESPACE, u'updated')) - doc.feed.updated = _timestamp(tpub) + # remove http://codespeak.net/lxml/objectify/pytype namespace + objectify.deannotate(feed) + etree.cleanup_namespaces(feed) - return doc + return feed def test(): @@ -335,12 +355,12 @@ if __name__ == "__main__" and True: command_name = args[0] command = command_dict[command_name] args = args[1:] - doc = command.run(args) + feed = command.run(args) + ostring = etree.tostring( + feed, pretty_print=True, xml_declaration=True, encoding='UTF-8') if options.ofilename == None: - doc.xml_write(**XML_WRITE_KWARGS) - print # add trailing endline + print ostring, else: with file(options.ofilename, 'w') as of: - doc.xml_write(stream=of, **XML_WRITE_KWARGS) - print >> of, '' # add trailing endline + of.write(ostring) -- 2.26.2