posts/Atomgen/atomgen.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright (C) 2009-2010, William Trevor King <wking@tremily.us>
   4 #
   5 # This program is free software: you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation, either version 3 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17
"""Produce `RFC 4287`_ compliant Atom 1.0 XML from the command line.
  19
  20 .. _RFC 4287: http://tools.ietf.org/html/rfc4287
  21 """
  22
  23 from optparse import OptionParser
  24 import sys
  25 import time
  26
  27 from lxml import etree
  28 from lxml import objectify
  29 from lxml.html import XHTML_NAMESPACE
  30
  31 __version__ = '0.2'
  32 GENERATOR_NAME = u'atomgen [based on lxml]'
  33
  34 ATOM_NAMESPACE = 'http://www.w3.org/2005/Atom'
  35
  36 ATOM = '{%s}' % ATOM_NAMESPACE
  37 XHTML = '{%s}' % XHTML_NAMESPACE
  38
  39 NSMAP = {
  40     None : ATOM_NAMESPACE,
  41     'html': XHTML_NAMESPACE,
  42     }
  43
  44
  45 def _id(url, time_published):
  46     """Convert a URL to an Atom ID
  47
  48     Following Mark Pilgrim's suggestions_.
  49     >>> _id('http://example.com/blog#5', 0)
  50     u'tag:example.com,1970-01-01:/blog/5'
  51
  52     Tags conform to RFC4151 tag syntax.  You're restricted to one post
  53     per day with a single url.
  54
  55     .. _suggestions: http://diveintomark.org/archives/2004/05/28/howto-atom-id
  56     .. _tag syntax: http://tools.ietf.org/html/rfc4151#section-2.1
  57     """
  58     # Discard everything before the domain name
  59     start = u"http://"
  60     if url.startswith(start):
  61         url = url[len(start):]
  62     # Change all # characters to /
  63     url = url.replace(u'#', u'/')
  64     # Extract the domain name
  65     end_of_domain_index = url.find(u'/')
  66     if end_of_domain_index == -1:
  67         domain = url
  68         trailer = u''
  69     else:
  70         domain = url[0:end_of_domain_index]
  71         trailer = url[end_of_domain_index:]
  72     # Immediately after the domain name, insert a comma, year-month-date, colon
  73     time_string = time.strftime("%Y-%m-%d", time.gmtime(time_published))
  74     url = u"tag:%s,%s:%s" % (domain, time_string, trailer)
  75     return url
  76
  77 def _timestamp(seconds=None):
  78     """Return an `RFC 3339`_ timestamp.
  79
  80     Complete date plus hours, minutes and seconds::
  81
  82         YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30Z)
  83
  84     Where the the trailing `Z` designates times in UTC.
  85
  86     >>> _timestamp(0)
  87     u'1970-01-01T00:00:00Z'
  88
  89     .. _RFC 3339: http://www.ietf.org/rfc/rfc3339.txt
  90     """
  91     utc = time.gmtime(seconds)
  92     string = time.strftime('%Y-%m-%dT%H:%M:%SZ', utc)
  93     return unicode(string)
  94
  95
  96 class Command (object):
  97     """A command exposed via the command line."""
  98     name = None
  99
 100     def run(self, argv):
 101         parser = self._get_parser()
 102         options,args = parser.parse_args(argv)
 103         return self._run(options, args)
 104
 105     def _get_parser(self):
 106         raise NotImplementedError()
 107
 108     def _run(self, options, args):
 109         raise NotImplementedError()
 110
 111
 112 class NewFeedCommand (Command):
 113     """Create a new feed
 114
 115     >>> c = NewFeedCommand()
 116     >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
 117     ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
 118     ...     '--author-email', 'wking@tremily.us',
 119     ...     'http://www.physics.drexel.edu/~wking/phys201'])
 120     >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
 121     ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
 122     <?xml version='1.0' encoding='UTF-8'?>
 123     <feed xmlns="http://www.w3.org/2005/Atom">
 124       <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
 125       <title>Physics 201</title>
 126       <author>
 127         <name>W. Trevor King</name>
 128         <email>wking@tremily.us</email>
 129         <uri>http://www.physics.drexel.edu/~wking/</uri>
 130       </author>
 131       <generator version="0.2">atomgen [based on lxml]</generator>
 132       <updated>...</updated>
 133     </feed>
 134     <BLANKLINE>
 135     """
 136     name = 'new'
 137
 138     def _get_parser(self):
 139         usage = ['%prog [general-options] new [options] URI',
 140                  '',
 141                  'Where',
 142                  '  URI is a URI used to generate a unique ID for the feed']
 143         parser = OptionParser(usage='\n'.join(usage))
 144         parser.disable_interspersed_args()
 145         parser.add_option('-t', '--title', dest='title', metavar='TITLE',
 146                           help='Feed title')
 147         parser.add_option('-a', '--author', dest='author', metavar='NAME',
 148                           help='Feed author name')
 149         parser.add_option('-u', '--author-uri', dest='author_uri',
 150                           metavar='URI', help='Feed author homepage URI')
 151         parser.add_option('-e', '--author-email', dest='author_email',
 152                           metavar='EMAIL', help='Feed author email address')
 153         return parser
 154
 155     def _run(self, options, args):
 156         uri = args[0]
 157
 158         feed = objectify.Element(ATOM + 'feed', nsmap=NSMAP)
 159
 160         tpub = time.time()
 161         etree.SubElement(feed, ATOM + 'id')
 162         feed.id = _id(uri, tpub)
 163
 164         if options.title:
 165             etree.SubElement(feed, ATOM + 'title')
 166             feed.title = options.title
 167
 168         if options.author or options.author_email or options.author_uri:
 169             etree.SubElement(feed, ATOM + 'author')
 170         if options.author:
 171             etree.SubElement(feed.author, ATOM + 'name')
 172             feed.author.name = options.author
 173         if options.author_email:
 174             etree.SubElement(feed.author, ATOM + 'email')
 175             feed.author.email = options.author_email
 176         if options.author_uri:
 177             etree.SubElement(feed.author, ATOM + 'uri')
 178             feed.author.uri = options.author_uri
 179
 180         etree.SubElement(feed, ATOM + 'generator')
 181         feed.generator = GENERATOR_NAME
 182         feed.generator.attrib['version'] = __version__
 183
 184         etree.SubElement(feed, ATOM + 'updated')
 185         feed.updated = _timestamp(tpub)
 186
 187         # remove http://codespeak.net/lxml/objectify/pytype namespace
 188         objectify.deannotate(feed)
 189         etree.cleanup_namespaces(feed)
 190
 191         return feed
 192
 193
 194 class AddEntryCommand (Command):
 195     """Add an entry to an existing feed.
 196
 197     >>> from os import close, remove
 198     >>> from StringIO import StringIO
 199     >>> from tempfile import mkstemp
 200
 201     First, create a feed to edit.
 202
 203     >>> c = NewFeedCommand()
 204     >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
 205     ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
 206     ...     '--author-email', 'wking@tremily.us',
 207     ...     'http://www.physics.drexel.edu/~wking/phys201'])
 208     >>> fd,path = mkstemp(suffix='.atom', prefix='atomgen-')
 209     >>> close(fd)
 210     >>> root = etree.ElementTree(feed)
 211     >>> root.write(path)
 212
 213     Now add an entry to that feed.
 214
 215     >>> c = AddEntryCommand()
 216     >>> stdin = sys.stdin
 217     >>> sys.stdin = StringIO('Changes will be noted in this feed.')
 218     >>> feed = c.run(['--input', path, 'Feed purpose',
 219     ...     'http://www.physics.drexel.edu/~wking/phys201'])
 220     >>> sys.stdin = stdin
 221     >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
 222     ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
 223     <?xml version='1.0' encoding='UTF-8'?>
 224     <feed xmlns="http://www.w3.org/2005/Atom">
 225       <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
 226       <title>Physics 201</title>
 227       <author>
 228         <name>W. Trevor King</name>
 229         <email>wking@tremily.us</email>
 230         <uri>http://www.physics.drexel.edu/~wking/</uri>
 231       </author>
 232       <generator version="0.2">atomgen [based on lxml]</generator>
 233       <updated>...</updated>
 234       <entry>
 235         <title>Feed purpose</title>
 236         <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
 237         <link href="http://www.physics.drexel.edu/~wking/phys201"/>
 238         <published>...</published>
 239         <updated>...</updated>
 240         <content type="xhtml">
 241           <html:div xmlns:html="http://www.w3.org/1999/xhtml">Changes will be noted in this feed.</html:div>
 242         </content>
 243       </entry>
 244     </feed>
 245     <BLANKLINE>
 246
 247     Note that we cannot move the html namespace declaration to the
 248     `<feed>` start tag until there is a way to update namespace maps
 249     on the fly.  See `lxml bug 555602`_.
 250
 251     .. _lxml bug 555602: https://bugs.launchpad.net/lxml/+bug/555602
 252
 253     Cleanup.
 254
 255     >>> remove(path)
 256     """
 257     name = 'add'
 258
 259     def _get_parser(self):
 260         usage = ['%prog [general-options] add [options] TITLE LINK',
 261                  '',
 262                  'Where',
 263                  '  TITLE is the title of the new entry',
 264                  '  LINK is the URI of that the entry refers to']
 265         parser = OptionParser(usage='\n'.join(usage))
 266         parser.disable_interspersed_args()
 267         parser.add_option('-i', '--input', dest='ifilename', metavar='FILE',
 268                           help=('Input file for generated feed '
 269                                 '(defaults to stdin)'))
 270         parser.add_option('-c', '--content', dest='content', metavar='FILE',
 271                           help=('Input file for entry content '
 272                                 '(defaults to stdin, unless input is stdin, '
 273                                 'in which case this option is required.)'))
 274         return parser
 275
 276     def _run(self, options, args):
 277         title = unicode(args[0])
 278         link = unicode(args[1])
 279
 280         parser = objectify.makeparser()
 281
 282         if options.ifilename == None:
 283             assert options.content != None, (
 284                 'Need to use one of --input or --content')
 285             root = objectify.parse(sys.stdin, parser=parser)
 286         else:
 287             root = objectify.parse(options.ifilename, parser=parser)
 288
 289         feed = root.getroot()
 290
 291         if options.content == None:
 292             content = sys.stdin.read()
 293         else:
 294             content = file(options.content, 'r').read()
 295
 296         entry = etree.SubElement(feed, ATOM + 'entry')
 297         etree.SubElement(entry, ATOM + 'title')
 298         entry.title = title
 299
 300         tpub = time.time()
 301         etree.SubElement(entry, ATOM + 'id')
 302         entry.id = _id(link, tpub)
 303
 304         etree.SubElement(entry, ATOM + 'link')
 305         entry.link.attrib['href'] = link
 306
 307         etree.SubElement(entry, ATOM + 'published')
 308         entry.published = _timestamp(tpub)
 309
 310         etree.SubElement(entry, ATOM + 'updated')
 311         entry.updated = _timestamp(tpub)
 312
 313         etree.SubElement(entry, ATOM + 'content')
 314         entry.content.attrib['type'] = 'xhtml'
 315         etree.SubElement(entry.content, XHTML + 'div')
 316         entry.content[XHTML + 'div'] = content
 317
 318         if not hasattr(feed, u'updated') :
 319             etree.SubElement(feed, ATOM + 'updated')
 320         feed.updated = _timestamp(tpub)
 321
 322         # remove http://codespeak.net/lxml/objectify/pytype namespace
 323         objectify.deannotate(feed)
 324         etree.cleanup_namespaces(feed)
 325
 326         return feed
 327
 328
 329 def test():
 330     import doctest
 331     doctest.testmod()
 332
 333
 334 if __name__ == "__main__" and True:
 335     commands = [NewFeedCommand(), AddEntryCommand()]
 336     command_dict = dict([(c.name, c) for c in commands])
 337     usage = ['%prog [options] command [command-options]',
 338              '',
 339              'Where command is one of']
 340     usage.extend(['  %s\t%s' % (c.name, c.__doc__.splitlines()[0])
 341                   for c in commands])
 342
 343     parser = OptionParser(usage='\n'.join(usage))
 344     parser.disable_interspersed_args()
 345     parser.add_option('-o', '--output', dest='ofilename', metavar='FILE',
 346                       help='Output file for generated feed (defaults to stdout)')
 347     parser.add_option('--test', dest='test', action='store_true',
 348                       help='Run the module test suite')
 349     (options, args) = parser.parse_args()
 350
 351     if options.test == True:
 352         test()
 353         sys.exit(0)
 354
 355     command_name = args[0]
 356     command = command_dict[command_name]
 357     args = args[1:]
 358     feed = command.run(args)
 359
 360     ostring = etree.tostring(
 361             feed, pretty_print=True, xml_declaration=True, encoding='UTF-8')
 362     if options.ofilename == None:
 363         print ostring,
 364     else:
 365         with file(options.ofilename, 'w') as of:
 366             of.write(ostring)