From: W. Trevor King <wking@drexel.edu>
Date: Sun, 9 Jan 2011 13:52:48 +0000 (-0500)
Subject: Don't convert (X)HTML to unicode (lxml works better on raw bytes).
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=dbbae826b72361a381a667cb176afa24ad321095;p=blog.git

Don't convert (X)HTML to unicode (lxml works better on raw bytes).
---

diff --git a/posts/get_css/get_css.py b/posts/get_css/get_css.py
index a25e3f2..0b8dcc7 100755
--- a/posts/get_css/get_css.py
+++ b/posts/get_css/get_css.py
@@ -65,7 +65,7 @@ def _standardize_text(text):
         text = text.replace(nl, '\n')
     return text
 
-def get_page(url):
+def get_page(url, standardize_text=True):
     LOG.info('get %s' % url)
     f = urlopen(url)
     info = f.info()
@@ -75,9 +75,9 @@ def get_page(url):
     ctype = f.headers['content-type']
     body = f.read()
     f.close()
-    if info.getmaintype() == 'text':
+    if info.getmaintype() == 'text' and standardize_text == True:
         try:
-            type,encoding = ctype.split('charset=')
+            _type,encoding = ctype.split('charset=')
         except ValueError:
             encoding = 'utf-8'
         body = unicode(body, encoding)
@@ -95,7 +95,7 @@ def is_stylesheet(link):
 
 def get_css(url):
     "Return urls for all CSS linked to from the (X)HTML at `url`."
-    info,body = get_page(url)
+    info,body = get_page(url, standardize_text=False)
     assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
     if info.getsubtype() == 'html':
         parser = etree.HTMLParser()
@@ -116,7 +116,7 @@ def _fetch_css(url):
     if info.gettype() != 'text/css':
         LOG.warn('invalid type for %s: %s' % (url, info.gettype()))
         return (None, None)
-    LOG.info('returning CSS for %s' % url)
+    LOG.info('returning CSS for %s (type %s)' % (url, type(body)))
     return (None, body)
 
 class MonkeyCSSParser (CSSParser):
@@ -250,7 +250,7 @@ if __name__ == '__main__':
     urls = get_css(args.url)
     full = consolidate_css(
         urls, data_dir=args.data_dir, data_url=args.data_url)
-    bytes = full.encode('utf-8')
+    bytes = full  #full.encode('utf-8')
 
     if args.output == None:
         sys.stdout.write(bytes)