From: W. Trevor King Date: Sun, 9 Jan 2011 13:52:48 +0000 (-0500) Subject: Don't convert (X)HTML to unicode (lxml works better on raw bytes). X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=dbbae826b72361a381a667cb176afa24ad321095;p=blog.git Don't convert (X)HTML to unicode (lxml works better on raw bytes). --- diff --git a/posts/get_css/get_css.py b/posts/get_css/get_css.py index a25e3f2..0b8dcc7 100755 --- a/posts/get_css/get_css.py +++ b/posts/get_css/get_css.py @@ -65,7 +65,7 @@ def _standardize_text(text): text = text.replace(nl, '\n') return text -def get_page(url): +def get_page(url, standardize_text=True): LOG.info('get %s' % url) f = urlopen(url) info = f.info() @@ -75,9 +75,9 @@ def get_page(url): ctype = f.headers['content-type'] body = f.read() f.close() - if info.getmaintype() == 'text': + if info.getmaintype() == 'text' and standardize_text == True: try: - type,encoding = ctype.split('charset=') + _type,encoding = ctype.split('charset=') except ValueError: encoding = 'utf-8' body = unicode(body, encoding) @@ -95,7 +95,7 @@ def is_stylesheet(link): def get_css(url): "Return urls for all CSS linked to from the (X)HTML at `url`." - info,body = get_page(url) + info,body = get_page(url, standardize_text=False) assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype() if info.getsubtype() == 'html': parser = etree.HTMLParser() @@ -116,7 +116,7 @@ def _fetch_css(url): if info.gettype() != 'text/css': LOG.warn('invalid type for %s: %s' % (url, info.gettype())) return (None, None) - LOG.info('returning CSS for %s' % url) + LOG.info('returning CSS for %s (type %s)' % (url, type(body))) return (None, body) class MonkeyCSSParser (CSSParser): @@ -250,7 +250,7 @@ if __name__ == '__main__': urls = get_css(args.url) full = consolidate_css( urls, data_dir=args.data_dir, data_url=args.data_url) - bytes = full.encode('utf-8') + bytes = full #full.encode('utf-8') if args.output == None: sys.stdout.write(bytes)