posts/ikiwiki-nanoblogger-import.py

   1 #!/usr/bin/env python
   2
   3 """
   4     Purpose:
   5     Nanoblogger-to-Ikiwiki import tool
   6
   7     Copyright:
   8     Copyright (C) 2007  Chris Lamb <lamby@debian.org>
   9     Copyright (C) 2010  W. Trevor King <wking@drexel.edu>
  10
  11     This program is free software: you can redistribute it and/or modify
  12     it under the terms of the GNU General Public License as published by
  13     the Free Software Foundation, either version 3 of the License, or
  14     (at your option) any later version.
  15
  16     This program is distributed in the hope that it will be useful,
  17     but WITHOUT ANY WARRANTY; without even the implied warranty of
  18     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19     GNU General Public License for more details.
  20
  21     You should have received a copy of the GNU General Public License
  22     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23
  24     Usage: run --help as an argument with this script.
  25
  26     Notes:
  27     I added some extra bits to include the [[!tag tags/foo]] stuff in the post,
  28     as it wasn't before, at all. I'll diff the versions out so you can see
  29     the mess I made :).
  30
  31 """
  32
  33 import os, sys
  34 import time
  35 import re
  36
  37 from datetime import datetime
  38 import codecs, htmlentitydefs
  39
  40
  41 class Tag (object):
  42     def __init__(self, path):
  43         self.path = path
  44         lines = [x.strip() for x in open(path, 'r').readlines()]
  45         self.tag = lines[0]
  46         self.files = lines[1:]
  47
  48     def is_tagged(self, path):
  49         return os.path.basename(path) in self.files
  50
  51     @staticmethod
  52     def is_tag_file(filename):
  53         return filename.startswith('cat_') and filename.endswith('.db')
  54
  55
  56 def parse_file(path, possible_tags):
  57     lines = open(path, 'r').readlines()
  58     post_dict = {}
  59     while True:  # parse header
  60         line = lines.pop(0)
  61         if line.startswith('-----'):
  62             break
  63         field,value = [x.strip() for x in line.split(':', 1)]
  64         post_dict[field.lower()] = value
  65     assert lines[0].startswith('BODY:'), lines[0]
  66     lines.pop(0)
  67     assert lines[-1].startswith('END-----'), lines[-1]
  68     lines.pop(-1)
  69     text = '\n'.join([unicode(x.rstrip(), 'utf-8') for x in lines])
  70     post_dict['text'] = text
  71
  72     post_dict['timestamp'] = time.mktime(time.strptime(
  73             post_dict['date'].replace('EST ', '').replace('EDT ', ''), '%c'))
  74     if 'EDT' in post_dict['date']:
  75         post_dict['timestamp'] += 4*60*60
  76     elif 'EST' in post_dict['date']:
  77         post_dict['timestamp'] += 5*60*60
  78     else:
  79         raise NotImplementedError('unknown time zone in %s'
  80                                   % post_dict['date'])
  81     post_dict['tags'] = [t.tag for t in possible_tags if t.is_tagged(path)]
  82     return post_dict
  83
  84
  85 def format_commit(post_dict, name, email, subdir, branch):
  86     stub = post_dict['title'].replace(' ', '_')
  87     if post_dict['format'].lower() == 'markdown':
  88         ext = 'mdwn'
  89     else:
  90         raise NotImplementedError('Unkown extension for %s'
  91                                   % post_dict['format'])
  92     commit_msg = '''Importing NanoBlogger post "%s"''' % (post_dict['title'])
  93
  94     lines = [
  95         '[[!meta  title="%s"]]' % (post_dict['title'].replace('"', r"'")),
  96         '[[!meta  date="%s"]]' % datetime.fromtimestamp(post_dict['timestamp']),
  97         post_dict['text']]
  98
  99     if len(post_dict['tags']) > 0:
 100         lines.append('')
 101     for tag in post_dict['tags']:
 102         lines.append(
 103             '[[!tag tags/%s]]' % (tag.replace(' ', '-').replace('/', '-').lower()))
 104     lines.append('')
 105     data = '\n'.join(lines).encode('utf-8', 'html_replace')
 106     ret = [
 107         "commit refs/heads/%s" % branch,
 108         "committer %s <%s> %d +0000" % (name, email, post_dict['timestamp']),
 109         "data %d" % len(commit_msg),
 110         commit_msg,
 111         "M 644 inline %s" % os.path.join(subdir, "%s.%s" % (stub, ext)),
 112         "data %d" % len(data),
 113         data,
 114     ]
 115     return '\n'.join(ret)
 116
 117
 118 def main(name, email, data_dir, subdir, branch='master'):
 119     files = os.listdir(data_dir)
 120     tags = []
 121     for x in files:  # read tag (category) files
 122         if Tag.is_tag_file(x):
 123             tags.append(Tag(os.path.join(data_dir, x)))
 124     posts = []
 125     for x in files:
 126         if Tag.is_tag_file(x):
 127             continue
 128         if x.endswith('.db'):
 129             continue  # ignore master.db.  it just repeats tag info
 130         posts.append(parse_file(os.path.join(data_dir, x), tags))
 131     posts.sort(key=lambda x:x['timestamp'])
 132     for x in posts:
 133         print format_commit(x, name, email, subdir, branch)
 134
 135
 136 if __name__ == "__main__":
 137     if len(sys.argv) not in (5, 6):
 138         print >>sys.stderr, "%s: usage: %s name email datadir subdir [branch] | git-fast-import " % (sys.argv[0], sys.argv[0])
 139     else:
 140         main(*sys.argv[1:])