From: W. Trevor King
Date: Sun, 9 Jan 2011 12:20:15 +0000 (-0500)
Subject: Add get_css post.
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=9c9f014f37a581f0d7cc774e8ad985956f12ee15;p=mw2txt.git

Add get_css post.
---

diff --git a/posts/get_css.mdwn b/posts/get_css.mdwn
new file mode 100644
index 0000000..754c31a
--- /dev/null
+++ b/posts/get_css.mdwn
@@ -0,0 +1,32 @@
+[[!meta title="get_css.py"]]
+
+The [Drexel physics department][dept] moved most of its content off of
+the department servers and onto college servers this quarter.  The
+college servers manage their content with SiteCore, so there was a
+reasonable amount of trouble getting everything over (see
+[[SiteCorePy]]).  Luckily, I got *lots* of help, and now I don't have
+to worry about the content that has migrated :).  However, not all of
+the content made the switch.
+
+We have a number of forms and databases that stayed on our department
+servers, and it's my job to make sure those pages look similar to the
+SiteCore pages that link to them.  No problem, you say: just clone the
+SiteCore page's CSS and apply it to the local pages.  That's exactly
+what I want to do, but the jittery folks upstream keep changing the
+CSS, so my cloned CSS gets out of sync fairly quickly.  To minimize my
+suffering, I've written a little script to automate the task of
+cloning another page's CSS.
+
+[[get_css.py]] scrapes an (X)HTML page for stylesheets (assuming there
+is no embedded styling in the HTML itself).  It then downloads all
+those CSS files, cleans them up with [cssutils][], and saves a single
+clone stylesheet mimicking their behaviour.  It also downloads all
+media referenced via `url(...)` entries in the CSS (e.g. background
+images), and adjusts the CSS to point to the local copies.
+
+[dept]: http://www.drexel.edu/physics/
+[cssutils]: http://code.google.com/p/cssutils/
+
+[[!tag tags/code]]
+[[!tag tags/python]]
+[[!tag tags/web]]
diff --git a/posts/get_css/get_css.py b/posts/get_css/get_css.py
new file mode 100755
index 0000000..5e34c0e
--- /dev/null
+++ b/posts/get_css/get_css.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2010 W. Trevor King
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+'Generate CSS mimicking a specified page'
+
+from __future__ import with_statement  # support Python 2.5
+
+from codecs import BOM_UTF8
+from logging import CRITICAL, DEBUG, getLogger, StreamHandler, Formatter
+from mimetypes import guess_extension
+from os import mkdir
+import os.path
+try:  # Python 3
+    from io import StringIO
+except ImportError:  # Python 2
+    from StringIO import StringIO
+import sys
+try:  # Python 3
+    from urllib.request import urlopen
+    from urllib.parse import urljoin
+except ImportError:  # Python 2
+    from urllib2 import urlopen
+    from urlparse import urljoin
+try:  # Python 2
+    unicode
+except NameError:  # Python 3 has no separate unicode type
+    unicode = str
+
+from lxml import etree
+from cssutils import CSSParser, replaceUrls, resolveImports
+from cssutils import log as _cssutils_log
+import cssutils  # for MonkeyCSSParser
+
+
+DATA_DIR = 'data'
+DATA_URL = '/data'
+
+LOG = getLogger('get_css')
+LOG.setLevel(DEBUG)
+_STREAM_HANDLER = StreamHandler()
+_STREAM_HANDLER.setLevel(CRITICAL)
+_STREAM_HANDLER.setFormatter(
+    Formatter('%(levelname)s - %(message)s'))
+LOG.addHandler(_STREAM_HANDLER)
+
+
+def _standardize_text(text):
+    # remove byte-order marker (BOM) if present
+    # possible Python parsing bug.  See
+    #   http://evanjones.ca/python-utf8.html#bom
+    text = text.lstrip(unicode(BOM_UTF8, 'utf-8'))
+
+    for nl in ['\r\n', '\r']:  # standardize newlines
+        text = text.replace(nl, '\n')
+    return text
+
+def get_page(url):
+    LOG.info('get %s' % url)
+    f = urlopen(url)
+    info = f.info()
+    _url = f.geturl()
+    if _url != url:
+        LOG.info('%s redirected to %s' % (url, _url))
+    ctype = f.headers['content-type']
+    body = f.read()
+    f.close()
+    if info.getmaintype() == 'text':
+        try:
+            _,encoding = ctype.split('charset=')
+        except ValueError:  # no charset declared; assume UTF-8
+            encoding = 'utf-8'
+        body = unicode(body, encoding)
+        body = _standardize_text(body)
+    return (info, body)
+
+
+def is_stylesheet(link):
+    "Return `True` if the `etree._Element` `link` is a stylesheet."
+    for attr,value in [('rel', 'stylesheet'), ('type', 'text/css')]:
+        v = (link.get(attr) or '').lower()  # the attribute may be missing
+        if v != value:
+            return False
+    return True
+
+def get_css(url):
+    "Return URLs for all CSS linked to from the (X)HTML at `url`."
+    info,body = get_page(url)
+    assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
+    if info.getsubtype() == 'html':
+        parser = etree.HTMLParser()
+    elif info.getsubtype() == 'xhtml':
+        parser = etree.XMLParser()
+    else:
+        raise ValueError('invalid page type %s' % info.gettype())
+    x = etree.parse(StringIO(body), parser)
+    for link in x.iterfind('.//link[@rel]'):
+        if is_stylesheet(link):
+            LOG.info('page %s links to %s' % (url, link.get('href')))
+            yield urljoin(url, link.get('href'))
+
+
+def _fetch_css(url):
+    "Get CSS from `url`, check its type, and print a log message."
+    info,body = get_page(url)
+    if info.gettype() != 'text/css':
+        LOG.warn('invalid type for %s: %s' % (url, info.gettype()))
+        return (None, None)
+    LOG.info('returning CSS for %s' % url)
+    return (None, body)
+
+class MonkeyCSSParser (CSSParser):
+    """Fix issue 48.
+
+    http://code.google.com/p/cssutils/issues/detail?id=48
+    """
+    def __init__(self, *args, **kwargs):
+        super(MonkeyCSSParser, self).__init__(*args, **kwargs)
+        self.__fetcher = kwargs['fetcher']
+
+    def parseUrl(self, href, encoding=None, media=None, title=None):
+        encoding, enctype, text = cssutils.util._readUrl(
+            href, fetcher=self.__fetcher,
+            overrideEncoding=encoding)
+        if enctype == 5:
+            # do not use the encoding if it is just the assumed
+            # UTF-8 default
+            encoding = None
+
+        if text is not None:
+            return self.parseString(text, encoding=encoding,
+                                    href=href, media=media, title=title)
+
+
+class CSSReplacer (object):
+    """Replace `url(...)` references in stylesheets with local values.
+
+    Downloads the referenced files, adjusting their extensions if
+    necessary, and updates the references to point to the local
+    copies.
+    """
+    def __init__(self, href=None, data_dir=None, data_url=None):
+        self._href = href
+        if data_dir is None:
+            data_dir = DATA_DIR
+        self._data_dir = data_dir
+        if data_url is None:
+            data_url = DATA_URL
+        if not data_url.endswith('/'):
+            data_url += '/'  # urljoin needs a trailing slash
+        self._data_url = data_url
+
+    def __call__(self, url):
+        full_url = urljoin(self._href, url)
+        _url = os.path.basename(url)
+        root,ext = os.path.splitext(_url)
+        info,data = get_page(full_url)
+        expected_ext = guess_extension(info.gettype())
+        for _from,_to in [('.jpe', '.jpg')]:
+            if expected_ext == _from:
+                LOG.debug('weird extension %s from type %s'
+                          % (_from, info.gettype()))
+                expected_ext = _to
+        if expected_ext != ext:
+            LOG.info('changing extension for %s from %s to %s'
+                     % (full_url, ext, expected_ext))
+        filename = root + expected_ext
+        target = urljoin(self._data_url, filename)
+        LOG.info('replace url %s -> %s' % (full_url, target))
+        LOG.debug('download %s' % full_url)
+        if not os.path.exists(self._data_dir):
+            mkdir(self._data_dir)
+        with open(os.path.join(self._data_dir, filename), 'wb') as f:
+            f.write(data)
+        return target  # point the stylesheet at the local copy
+
+
+def _standardize_css(sheet, **kwargs):
+    "Post-process `sheet` to adapt it to the local environment."
+    sheet = resolveImports(sheet)
+    replaceUrls(sheet, CSSReplacer(href=sheet.href, **kwargs))
+    return sheet
+
+def consolidate_css(urls, parser=None, **kwargs):
+    """Get a single, standardized stylesheet combining each URL in `urls`.
+
+    Missing URLs are ignored.
+    """
+    if parser is None:
+        parser = MonkeyCSSParser(fetcher=_fetch_css)
+    lines = []
+    for url in urls:
+        sheet = parser.parseUrl(url)
+        if sheet is None:
+            continue
+        sheet = _standardize_css(sheet, **kwargs)
+        lines.extend(['/* %s */' % url, '', sheet.cssText, ''])
+    return '\n'.join(lines)
+
+
+if __name__ == '__main__':
+    try:  # argparse code is untested
+        from argparse import ArgumentParser
+
+        p = ArgumentParser(description=__doc__)
+        p.add_argument('-v', '--verbose', default=0, action='count')
+        p.add_argument(
+            '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
+            help='path to the downloaded image directory (%(default)s).')
+        p.add_argument(
+            '-u', '--data-url', default=DATA_URL, dest='data_url',
+            help='URL to the downloaded image directory (%(default)s).')
+        p.add_argument(
+            '-o', '--output',
+            help='path to the consolidated output file (default: stdout).')
+        p.add_argument('url', metavar='URL', help='page to mimic')
+        args = p.parse_args()
+    except ImportError:
+        from optparse import OptionParser
+        p = OptionParser(description=__doc__)
+        p.add_option('-v', '--verbose', default=0, action='count')
+        p.add_option(
+            '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
+            help='path to the downloaded image directory (%default).')
+        p.add_option(
+            '-u', '--data-url', default=DATA_URL, dest='data_url',
+            help='URL to the downloaded image directory (%default).')
+        p.add_option(
+            '-o', '--output',
+            help='path to the consolidated output file (default: stdout).')
+        options,args = p.parse_args()
+        options.url = args[0]
+        args = options
+
+    log_level = CRITICAL - 10*args.verbose
+    _STREAM_HANDLER.setLevel(log_level)
+    _cssutils_log.setLevel(log_level)
+
+    urls = get_css(args.url)
+    full = consolidate_css(
+        urls, data_dir=args.data_dir, data_url=args.data_url)
+    data = full.encode('utf-8')
+
+    if args.output is None:
+        sys.stdout.write(data)
+    else:
+        with open(args.output, 'wb') as f:
+            f.write(data)
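For reference, a minimal sketch of how the pieces above fit together when
driven from Python rather than from the command line.  This is not part of
the patch; it assumes the script is importable as `get_css`, and the page
URL and output filename are hypothetical stand-ins:

    from get_css import get_css, consolidate_css

    # collect stylesheet URLs linked from the page to mimic
    # (http://www.example.com/ is a placeholder)
    urls = get_css('http://www.example.com/')

    # download and standardize the stylesheets, saving url(...) media
    # under data/ and rewriting references to /data/
    css = consolidate_css(urls, data_dir='data', data_url='/data')

    # write the consolidated clone stylesheet, UTF-8 encoded
    with open('clone.css', 'wb') as f:
        f.write(css.encode('utf-8'))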