doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn

   1 [[!meta title="ikiwiki-wordpress-import"]]
   2
   3 I modified the script a bit so categories and tags would actually show up in the output file.
   4
   5
   6 <pre>
   7 #!/usr/bin/env python
   8
   9 """
  10     Purpose:
  11     Wordpress-to-Ikiwiki import tool
  12
  13     Copyright:
  14     Copyright (C) 2007  Chris Lamb <chris@chris-lamb.co.uk>
  15
  16     This program is free software: you can redistribute it and/or modify
  17     it under the terms of the GNU General Public License as published by
  18     the Free Software Foundation, either version 3 of the License, or
  19     (at your option) any later version.
  20
  21     This program is distributed in the hope that it will be useful,
  22     but WITHOUT ANY WARRANTY; without even the implied warranty of
  23     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  24     GNU General Public License for more details.
  25
  26     You should have received a copy of the GNU General Public License
  27     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  28
  29     Usage: run --help as an argument with this script.
  30
  31     Notes:
   32     I added code that emits a [[!tag foo]] directive in each post for its
   33     categories and tags; the original script did not emit them at all.
   34     I'll post a diff between the versions so you can see what I changed :).
  35
  36 """
  37
  38 import os, sys
  39 import time
  40 import re
  41
  42 from BeautifulSoup import BeautifulSoup
  43
  44 import codecs, htmlentitydefs
  45
  46 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
  47     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
  48
  49 def main(name, email, subdir, branch='master'):
  50     soup = BeautifulSoup(sys.stdin.read())
  51
  52     # Regular expression to match stub in URL.
  53     stub_pattern = re.compile(r'.*\/(.+)\/$')
  54
  55     for x in soup.findAll('item'):
  56         # Ignore draft posts
  57         if x.find('wp:status').string != 'publish': continue
  58
  59         match = stub_pattern.match(x.guid.string)
  60         if match:
  61             stub = match.groups()[0]
  62         else:
  63             # Fall back to our own stubs
  64             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
  65
  66         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
  67         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
  68
  69         content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
  70         content += x.find('content:encoded').string.replace('\r\n', '\n')
  71
  72         # categories = x.findAll('category')
  73         # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
  74         # categories = x.findAll({'category':True}, domain=["category", "tag"])
  75         # categories = x.findAll({'category':True}, nicename=True)
  76         """
  77         We do it differently here because we have duplicates otherwise.
  78         Take a look:
  79         <category><![CDATA[Health]]></category>
  80         <category domain="category" nicename="health"><![CDATA[Health]]></category>
  81
  82         If we do the what original did, we end up with all tags and cats doubled.
  83         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
  84         I'd much rather have the value of 'nicename', and tried, but my
  85         python skillz are extremely limited....
  86         """
  87         categories = x.findAll('category', nicename=True)
  88         if categories:
  89             content += "\n"
  90             for cat in categories:
  91                 # remove 'tags/' because we have a 'tagbase' set.
  92                 # your choice: 'tag', or 'taglink'
  93                 # content += "\n[[!tag %s]]" % (cat.string.replace(' ', '-'))
  94                 content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))
  95                 # print >>sys.stderr, cat.string.replace(' ', '-')
  96
  97         # moved this thing down
  98         data = content.encode('ascii', 'html_replace')
  99         print "commit refs/heads/%s" % branch
 100         print "committer %s <%s> %d +0000" % (name, email, timestamp)
 101         print "data %d" % len(commit_msg)
 102         print commit_msg
 103         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 104         print "data %d" % len(data)
 105         print data
 106
 107 if __name__ == "__main__":
 108     if len(sys.argv) not in (4, 5):
 109         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 110     else:
 111         main(*sys.argv[1:])
 112
 113 </pre>