--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright (C) 2010 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+
+'Generate CSS mimicking a specified page'
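+#
+# Example invocation (hypothetical URL; assumes the script is saved as
+# `get_css.py` -- options are parsed in the `__main__` block below):
+#
+#   python get_css.py -vv -o local.css http://example.com/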
+
+from __future__ import with_statement # support Python 2.5
+
+from codecs import BOM_UTF8
+from logging import CRITICAL, DEBUG, getLogger, StreamHandler, Formatter
+from mimetypes import guess_extension
+from os import mkdir
+import os.path
+try: # Python 3
+ from io import StringIO
+except ImportError:
+ from StringIO import StringIO
+import sys
+try: # Python 3
+ from urllib.request import urlopen
+ from urllib.parse import urljoin
+except ImportError:
+ from urllib2 import urlopen
+ from urlparse import urljoin
+
+from lxml import etree
+from cssutils import CSSParser, replaceUrls, resolveImports
+from cssutils import log as _cssutils_log
+import cssutils # for MonkeyCSSParser
+
+
+DATA_DIR = 'data'
+DATA_URL = '/data'
+
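+# `LOG` itself passes everything through; the handler level (adjusted
+# by the `--verbose` option in `__main__`) decides what is shown.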
+LOG = getLogger('get_css')
+LOG.setLevel(DEBUG)
+_STREAM_HANDLER = StreamHandler()
+_STREAM_HANDLER.setLevel(CRITICAL)
+_STREAM_HANDLER.setFormatter(
+ Formatter('%(levelname)s - %(message)s'))
+LOG.addHandler(_STREAM_HANDLER)
+
+
+def _standardize_text(text):
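+    "Strip any leading byte-order marker and normalize newlines."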
+    # Remove the byte-order marker (BOM), if present; leaving it in
+    # can trigger a Python parsing bug.  See
+    # http://evanjones.ca/python-utf8.html#bom
+ text = text.lstrip(unicode(BOM_UTF8, 'utf-8'))
+
+ for nl in ['\r\n', '\r']: # standardize newlines
+ text = text.replace(nl, '\n')
+ return text
+
+def get_page(url):
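+    "Return `(info, body)` for `url`, decoding text bodies to unicode."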
+ LOG.info('get %s' % url)
+ f = urlopen(url)
+ info = f.info()
+ _url = f.geturl()
+ if _url != url:
+ LOG.info('%s redirected to %s' % (url, _url))
+ ctype = f.headers['content-type']
+ body = f.read()
+ f.close()
+ if info.getmaintype() == 'text':
+        try:
+            encoding = ctype.split('charset=')[1].strip()
+        except IndexError: # no charset parameter; assume UTF-8
+            encoding = 'utf-8'
+ body = unicode(body, encoding)
+ body = _standardize_text(body)
+ return (info, body)
+
+
+def is_stylesheet(link):
+ "Return `True` if the `etree._Element` `link` is a stylesheet."
+ for attr,value in [('rel', 'stylesheet'), ('type', 'text/css')]:
+        v = link.get(attr)
+        if v is None or v.lower() != value:
+            return False
+ return True
+
+def get_css(url):
+ "Return urls for all CSS linked to from the (X)HTML at `url`."
+ info,body = get_page(url)
+ assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
+ if info.getsubtype() == 'html':
+ parser = etree.HTMLParser()
+ elif info.getsubtype() == 'xhtml':
+ parser = etree.XMLParser()
+ else:
+ raise ValueError('invalid page type %s' % info.gettype())
+ x = etree.parse(StringIO(body), parser)
+ for link in x.iterfind('.//link[@rel]'):
+ if is_stylesheet(link):
+ LOG.info('page %s links to %s' % (url, link.get('href')))
+ yield urljoin(url, link.get('href'))
+
+
+def _fetch_css(url):
+ "Get CSS from `url`, check type, and print a log message."
+ info,body = get_page(url)
+ if info.gettype() != 'text/css':
+        LOG.warning('invalid type for %s: %s' % (url, info.gettype()))
+ return (None, None)
+ LOG.info('returning CSS for %s' % url)
+ return (None, body)
+
+class MonkeyCSSParser (CSSParser):
+ """Fix issue 48.
+
+ http://code.google.com/p/cssutils/issues/detail?id=48
+ """
+ def __init__(self, *args, **kwargs):
+ super(MonkeyCSSParser, self).__init__(*args, **kwargs)
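+        # Keep our own reference so the `parseUrl` override below can
+        # pass the fetcher straight to `cssutils.util._readUrl`.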
+        self.__fetcher = kwargs.get('fetcher')
+
+ def parseUrl(self, href, encoding=None, media=None, title=None):
+ encoding, enctype, text = cssutils.util._readUrl(
+ href, fetcher=self.__fetcher,
+ overrideEncoding=encoding)
+ if enctype == 5:
+            # do not pass the encoding on if it was only the UTF-8 default
+ encoding = None
+
+ if text is not None:
+ return self.parseString(text, encoding=encoding,
+ href=href, media=media, title=title)
+
+
+class CSSReplacer (object):
+ """Replace `url(...)` references in stylesheets with local values.
+
+    Downloads each referenced file, adjusting its extension if
+    necessary, and updates the reference to point at the local copy.
+ """
+ def __init__(self, href=None, data_dir=None, data_url=None):
+ self._href = href
+        if data_dir is None:
+            data_dir = DATA_DIR
+        self._data_dir = data_dir
+        if data_url is None:
+            data_url = DATA_URL
+        if not data_url.endswith('/'):
+            data_url += '/' # urljoin needs a trailing slash
+ self._data_url = data_url
+
+ def __call__(self, url):
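+        "Download the target of `url` and return its local replacement URL."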
+ full_url = urljoin(self._href, url)
+ _url = os.path.basename(url)
+ root,ext = os.path.splitext(_url)
+ info,data = get_page(full_url)
+ expected_ext = guess_extension(info.gettype())
+ for _from,_to in [('.jpe', '.jpg')]:
+ if expected_ext == _from:
+                LOG.debug('weird extension %s from type %s'
+ % (_from, info.gettype()))
+ expected_ext = _to
+ if expected_ext != ext:
+ LOG.info('changing extension for %s from %s to %s'
+ % (full_url, ext, expected_ext))
+ filename = root + expected_ext
+ target = urljoin(self._data_url, filename)
+ LOG.info('replace url %s -> %s' % (full_url, target))
+ LOG.debug('download %s' % full_url)
+ if not os.path.exists(self._data_dir):
+ mkdir(self._data_dir)
+ with open(os.path.join(self._data_dir, filename), 'wb') as f:
+ f.write(data)
+        return target
+
+
+def _standardize_css(sheet, **kwargs):
+ "Post-process `sheet` to adapt it to to the local environment."
+ sheet = resolveImports(sheet)
+ replaceUrls(sheet, CSSReplacer(href=sheet.href, **kwargs))
+ return sheet
+
+def consolidate_css(urls, parser=None, **kwargs):
+ """Get a single, standardized stylesheet combining each URL in `urls`.
+
+ Missing URLs are ignored.
+ """
+    if parser is None:
+        parser = MonkeyCSSParser(fetcher=_fetch_css)
+    lines = []
+    for url in urls:
+        sheet = parser.parseUrl(url)
+        if sheet is None:
+ continue
+ sheet = _standardize_css(sheet, **kwargs)
+ lines.extend(['/* %s */' % url, '', sheet.cssText, ''])
+ return '\n'.join(lines)
+
+
+if __name__ == '__main__':
+ try: # argparse code is untested
+ from argparse import ArgumentParser
+
+ p = ArgumentParser(description=__doc__)
+        p.add_argument('-v', '--verbose', default=0, action='count')
+ p.add_argument(
+ '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
+            help='path to the downloaded image directory (%(default)s).')
+ p.add_argument(
+ '-u', '--data-url', default=DATA_URL, dest='data_url',
+            help='URL to the downloaded image directory (%(default)s).')
+ p.add_argument(
+ '-o', '--output',
+ help='path to the consolidated output file (`stdout`)')
+ p.add_argument('url', metavar='URL', help='page to mimic')
+ args = p.parse_args()
+ except ImportError:
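+        # argparse is new in Python 2.7; fall back to optparse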
+ from optparse import OptionParser
+ p = OptionParser(description=__doc__)
+ p.add_option('-v', '--verbose', default=0, action='count')
+ p.add_option(
+ '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
+            help='path to the downloaded image directory (%default).')
+        p.add_option(
+            '-u', '--data-url', default=DATA_URL, dest='data_url',
+            help='URL to the downloaded image directory (%default).')
+ p.add_option(
+ '-o', '--output',
+ help='path to the consolidated output file (`stdout`)')
+ options,args = p.parse_args()
+ options.url = args[0]
+ args = options
+
+ log_level = CRITICAL - 10*args.verbose
+ _STREAM_HANDLER.setLevel(log_level)
+ _cssutils_log.setLevel(log_level)
+
+ urls = get_css(args.url)
+ full = consolidate_css(
+ urls, data_dir=args.data_dir, data_url=args.data_url)
+    data = full.encode('utf-8')
+
+    if args.output is None:
+        sys.stdout.write(data)
+    else:
+        with open(args.output, 'wb') as f:
+            f.write(data)