c26b0e60235b72d1735793e9f8edb6d9e471fc2c
[blog.git] / posts / Atomgen / atomgen.py
1 #!/usr/bin/env python
2 #
3 # Copyright (C) 2009-2010, William Trevor King <wking@tremily.us>
4 #
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18 """Produce `RFC 4287` compliant Atom 1.0 XML from the command line.
19
20 .. _RFC 4287: http://tools.ietf.org/html/rfc4287
21 """
22
23 from optparse import OptionParser
24 import sys
25 import time
26
27 from lxml import etree
28 from lxml import objectify
29 from lxml.html import XHTML_NAMESPACE
30
# Tool version; NewFeedCommand writes it to the <generator version="..."> attribute.
__version__ = '0.2'
# Text content of the feed's <generator> element.
GENERATOR_NAME = u'atomgen [based on lxml]'

ATOM_NAMESPACE = 'http://www.w3.org/2005/Atom'

# '{namespace-uri}' prefixes; prepend to a local name to get a qualified tag
ATOM = '{' + ATOM_NAMESPACE + '}'
XHTML = '{' + XHTML_NAMESPACE + '}'

# Prefix -> namespace map handed to lxml; the None key is the default namespace
NSMAP = {None: ATOM_NAMESPACE, 'html': XHTML_NAMESPACE}
43
44
45 def _id(url, time_published):
46     """Convert a URL to an Atom ID
47
48     Following Mark Pilgrim's suggestions_.
49     >>> _id('http://example.com/blog#5', 0)
50     u'tag:example.com,1970-01-01:/blog/5'
51
52     Tags conform to RFC4151 tag syntax.  You're restricted to one post
53     per day with a single url.
54
55     .. _suggestions: http://diveintomark.org/archives/2004/05/28/howto-atom-id
56     .. _tag syntax: http://tools.ietf.org/html/rfc4151#section-2.1
57     """
58     # Discard everything before the domain name
59     start = u"http://"
60     if url.startswith(start):
61         url = url[len(start):]
62     # Change all # characters to /
63     url = url.replace(u'#', u'/')
64     # Extract the domain name
65     end_of_domain_index = url.find(u'/')
66     if end_of_domain_index == -1:
67         domain = url
68         trailer = u''
69     else:
70         domain = url[0:end_of_domain_index]
71         trailer = url[end_of_domain_index:]
72     # Immediately after the domain name, insert a comma, year-month-date, colon
73     time_string = time.strftime("%Y-%m-%d", time.gmtime(time_published))
74     url = u"tag:%s,%s:%s" % (domain, time_string, trailer)
75     return url
76
77 def _timestamp(seconds=None):
78     """Return an `RFC 3339`_ timestamp.
79
80     Complete date plus hours, minutes and seconds::
81
82         YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30Z)
83
84     Where the the trailing `Z` designates times in UTC.
85
86     >>> _timestamp(0)
87     u'1970-01-01T00:00:00Z'
88
89     .. _RFC 3339: http://www.ietf.org/rfc/rfc3339.txt
90     """
91     utc = time.gmtime(seconds)
92     string = time.strftime('%Y-%m-%dT%H:%M:%SZ', utc)
93     return unicode(string)
94
95
class Command (object):
    """Base class for a subcommand exposed via the command line.

    Subclasses set `name` and implement `_get_parser` and `_run`.
    """
    name = None

    def run(self, argv):
        """Parse `argv` with this command's parser and execute it.

        Returns whatever `_run` returns.
        """
        opts, positional = self._get_parser().parse_args(argv)
        return self._run(opts, positional)

    def _get_parser(self):
        """Return the option parser for this command (subclass hook)."""
        raise NotImplementedError()

    def _run(self, options, args):
        """Execute the command with parsed options (subclass hook)."""
        raise NotImplementedError()
110
111
class NewFeedCommand (Command):
    """Create a new feed

    >>> c = NewFeedCommand()
    >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
    ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
    ...     '--author-email', 'wking@tremily.us',
    ...     'http://www.physics.drexel.edu/~wking/phys201'])
    >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
    ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
    <?xml version='1.0' encoding='UTF-8'?>
    <feed xmlns="http://www.w3.org/2005/Atom">
      <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
      <title>Physics 201</title>
      <author>
        <name>W. Trevor King</name>
        <email>wking@tremily.us</email>
        <uri>http://www.physics.drexel.edu/~wking/</uri>
      </author>
      <generator version="0.2">atomgen [based on lxml]</generator>
      <updated>...</updated>
    </feed>
    <BLANKLINE>
    """
    name = 'new'

    def _get_parser(self):
        """Return the option parser for the `new` subcommand."""
        usage = ['%prog [general-options] new [options] URI',
                 '',
                 'Where',
                 '  URI is a URI used to generate a unique ID for the feed']
        parser = OptionParser(usage='\n'.join(usage))
        parser.disable_interspersed_args()
        parser.add_option('-t', '--title', dest='title', metavar='TITLE',
                          help='Feed title')
        parser.add_option('-a', '--author', dest='author', metavar='NAME',
                          help='Feed author name')
        parser.add_option('-u', '--author-uri', dest='author_uri',
                          metavar='URI', help='Feed author homepage URI')
        parser.add_option('-e', '--author-email', dest='author_email',
                          metavar='EMAIL', help='Feed author email address')
        return parser

    def _run(self, options, args):
        """Build and return a new `<feed>` element.

        `args[0]` is the URI the feed ID is generated from.  If it is
        missing, a usage error is reported through the option parser
        (which exits) instead of failing with an IndexError.
        """
        if not args:
            self._get_parser().error('missing required URI argument')
        uri = args[0]

        feed = objectify.Element(ATOM + 'feed', nsmap=NSMAP)

        # One timestamp is shared by the ID and <updated>
        tpub = time.time()
        etree.SubElement(feed, ATOM + 'id')
        feed.id = _id(uri, tpub)

        if options.title:
            etree.SubElement(feed, ATOM + 'title')
            feed.title = options.title

        # Only create <author> when at least one author detail was given
        if options.author or options.author_email or options.author_uri:
            etree.SubElement(feed, ATOM + 'author')
        if options.author:
            etree.SubElement(feed.author, ATOM + 'name')
            feed.author.name = options.author
        if options.author_email:
            etree.SubElement(feed.author, ATOM + 'email')
            feed.author.email = options.author_email
        if options.author_uri:
            etree.SubElement(feed.author, ATOM + 'uri')
            feed.author.uri = options.author_uri

        etree.SubElement(feed, ATOM + 'generator')
        feed.generator = GENERATOR_NAME
        feed.generator.attrib['version'] = __version__

        etree.SubElement(feed, ATOM + 'updated')
        feed.updated = _timestamp(tpub)

        # remove http://codespeak.net/lxml/objectify/pytype namespace
        objectify.deannotate(feed)
        etree.cleanup_namespaces(feed)

        return feed
192
193
class AddEntryCommand (Command):
    """Add an entry to an existing feed.

    >>> from os import close, remove
    >>> from StringIO import StringIO
    >>> from tempfile import mkstemp

    First, create a feed to edit.

    >>> c = NewFeedCommand()
    >>> feed = c.run(['--title', 'Physics 201', '--author', 'W. Trevor King',
    ...     '--author-uri', 'http://www.physics.drexel.edu/~wking/',
    ...     '--author-email', 'wking@tremily.us',
    ...     'http://www.physics.drexel.edu/~wking/phys201'])
    >>> fd,path = mkstemp(suffix='.atom', prefix='atomgen-')
    >>> close(fd)
    >>> root = etree.ElementTree(feed)
    >>> root.write(path)

    Now add an entry to that feed.

    >>> c = AddEntryCommand()
    >>> stdin = sys.stdin
    >>> sys.stdin = StringIO('Changes will be noted in this feed.')
    >>> feed = c.run(['--input', path, 'Feed purpose',
    ...     'http://www.physics.drexel.edu/~wking/phys201'])
    >>> sys.stdin = stdin
    >>> print etree.tostring(feed, pretty_print=True, xml_declaration=True,
    ...     encoding='UTF-8')  # doctest: +ELLIPSIS, +REPORT_UDIFF
    <?xml version='1.0' encoding='UTF-8'?>
    <feed xmlns="http://www.w3.org/2005/Atom">
      <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
      <title>Physics 201</title>
      <author>
        <name>W. Trevor King</name>
        <email>wking@tremily.us</email>
        <uri>http://www.physics.drexel.edu/~wking/</uri>
      </author>
      <generator version="0.2">atomgen [based on lxml]</generator>
      <updated>...</updated>
      <entry>
        <title>Feed purpose</title>
        <id>tag:www.physics.drexel.edu,...:/~wking/phys201</id>
        <link href="http://www.physics.drexel.edu/~wking/phys201"/>
        <published>...</published>
        <updated>...</updated>
        <content type="xhtml">
          <html:div xmlns:html="http://www.w3.org/1999/xhtml">Changes will be noted in this feed.</html:div>
        </content>
      </entry>
    </feed>
    <BLANKLINE>

    Note that we cannot move the html namespace declaration to the
    `<feed>` start tag until there is a way to update namespace maps
    on the fly.  See `lxml bug 555602`_.

    .. _lxml bug 555602: https://bugs.launchpad.net/lxml/+bug/555602

    Cleanup.

    >>> remove(path)
    """
    name = 'add'

    def _get_parser(self):
        """Return the option parser for the `add` subcommand."""
        usage = ['%prog [general-options] add [options] TITLE LINK',
                 '',
                 'Where',
                 '  TITLE is the title of the new entry',
                 '  LINK is the URI of that the entry refers to']
        parser = OptionParser(usage='\n'.join(usage))
        parser.disable_interspersed_args()
        parser.add_option('-i', '--input', dest='ifilename', metavar='FILE',
                          help=('Input file for generated feed '
                                '(defaults to stdin)'))
        parser.add_option('-c', '--content', dest='content', metavar='FILE',
                          help=('Input file for entry content '
                                '(defaults to stdin, unless input is stdin, '
                                'in which case this option is required.)'))
        return parser

    def _run(self, options, args):
        """Append a new `<entry>` to an existing feed and return the feed.

        `args[0]` is the entry title and `args[1]` the entry link.  The
        feed is read from `--input` (or stdin) and the entry content from
        `--content` (or stdin); both cannot come from stdin at once.
        Invalid invocations are reported as usage errors through the
        option parser (which exits).
        """
        if len(args) < 2:
            self._get_parser().error(
                'missing required TITLE and LINK arguments')
        # Formatting into a u'' literal converts to unicode on Python 2
        # without relying on the Python-2-only `unicode` builtin.
        title = u'%s' % args[0]
        link = u'%s' % args[1]

        parser = objectify.makeparser()

        if options.ifilename is None:
            # stdin can supply the feed or the content, not both.  A real
            # error is used here because asserts are stripped under -O.
            if options.content is None:
                self._get_parser().error(
                    'Need to use one of --input or --content')
            root = objectify.parse(sys.stdin, parser=parser)
        else:
            root = objectify.parse(options.ifilename, parser=parser)

        feed = root.getroot()

        if options.content is None:
            content = sys.stdin.read()
        else:
            # Close the content file promptly instead of leaking the handle
            with open(options.content, 'r') as content_file:
                content = content_file.read()

        entry = etree.SubElement(feed, ATOM + 'entry')
        etree.SubElement(entry, ATOM + 'title')
        entry.title = title

        # One timestamp is shared by the ID, <published> and both <updated>
        tpub = time.time()
        etree.SubElement(entry, ATOM + 'id')
        entry.id = _id(link, tpub)

        etree.SubElement(entry, ATOM + 'link')
        entry.link.attrib['href'] = link

        etree.SubElement(entry, ATOM + 'published')
        entry.published = _timestamp(tpub)

        etree.SubElement(entry, ATOM + 'updated')
        entry.updated = _timestamp(tpub)

        etree.SubElement(entry, ATOM + 'content')
        entry.content.attrib['type'] = 'xhtml'
        etree.SubElement(entry.content, XHTML + 'div')
        entry.content[XHTML + 'div'] = content

        # Adding an entry also bumps the feed-level <updated> stamp
        if not hasattr(feed, u'updated'):
            etree.SubElement(feed, ATOM + 'updated')
        feed.updated = _timestamp(tpub)

        # remove http://codespeak.net/lxml/objectify/pytype namespace
        objectify.deannotate(feed)
        etree.cleanup_namespaces(feed)

        return feed
327
328
def test():
    """Run doctest.testmod() with its default arguments.

    With no module given, doctest examines the module being run as
    the main script.
    """
    from doctest import testmod
    testmod()
332
333
# Command-line entry point: parse the general options, dispatch to the
# selected subcommand, and write the resulting feed as UTF-8 XML.
if __name__ == "__main__":
    commands = [NewFeedCommand(), AddEntryCommand()]
    command_dict = dict((c.name, c) for c in commands)
    usage = ['%prog [options] command [command-options]',
             '',
             'Where command is one of']
    # Describe each command by the first line of its class docstring
    usage.extend('  %s\t%s' % (c.name, c.__doc__.splitlines()[0])
                 for c in commands)

    parser = OptionParser(usage='\n'.join(usage))
    # Stop at the command name so later options go to the subcommand
    parser.disable_interspersed_args()
    parser.add_option('-o', '--output', dest='ofilename', metavar='FILE',
                      help='Output file for generated feed (defaults to stdout)')
    parser.add_option('--test', dest='test', action='store_true',
                      help='Run the module test suite')
    (options, args) = parser.parse_args()

    if options.test:
        test()
        sys.exit(0)

    # Report bad invocations as usage errors rather than tracebacks
    if not args:
        parser.error('missing command')
    command_name = args[0]
    if command_name not in command_dict:
        parser.error("unknown command '%s'" % command_name)
    command = command_dict[command_name]
    args = args[1:]
    feed = command.run(args)

    ostring = etree.tostring(
            feed, pretty_print=True, xml_declaration=True, encoding='UTF-8')
    if options.ofilename is None:
        # Write the serialized feed as-is, without an extra trailing newline
        sys.stdout.write(ostring)
    else:
        with open(options.ofilename, 'w') as of:
            of.write(ostring)