Don't convert (X)HTML to unicode (lxml works better on raw bytes).

author W. Trevor King <wking@drexel.edu>

Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)

committer W. Trevor King <wking@drexel.edu>

Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)
author W. Trevor King <wking@drexel.edu>
Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)
committer W. Trevor King <wking@drexel.edu>
Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)
diff --git a/posts/get_css/get_css.py b/posts/get_css/get_css.py

index a25e3f24c0a8c19af46041f73c8590db2dca14fc..0b8dcc7317aef4670423dddb27fdfbb4557c0427 100755 (executable)
--- a/posts/get_css/get_css.py
+++ b/posts/get_css/get_css.py
@@ -65,7 +65,7 @@ def _standardize_text(text):
          text = text.replace(nl, '\n')
      return text
  
-def get_page(url):
+def get_page(url, standardize_text=True):
      LOG.info('get %s' % url)
      f = urlopen(url)
      info = f.info()
@@ -75,9 +75,9 @@ def get_page(url):
      ctype = f.headers['content-type']
      body = f.read()
      f.close()
-    if info.getmaintype() == 'text':
+    if info.getmaintype() == 'text' and standardize_text == True:
          try:
-            type,encoding = ctype.split('charset=')
+            _type,encoding = ctype.split('charset=')
          except ValueError:
              encoding = 'utf-8'
          body = unicode(body, encoding)
@@ -95,7 +95,7 @@ def is_stylesheet(link):
  
  def get_css(url):
      "Return urls for all CSS linked to from the (X)HTML at `url`."
-    info,body = get_page(url)
+    info,body = get_page(url, standardize_text=False)
      assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
      if info.getsubtype() == 'html':
          parser = etree.HTMLParser()
@@ -116,7 +116,7 @@ def _fetch_css(url):
      if info.gettype() != 'text/css':
          LOG.warn('invalid type for %s: %s' % (url, info.gettype()))
          return (None, None)
-    LOG.info('returning CSS for %s' % url)
+    LOG.info('returning CSS for %s (type %s)' % (url, type(body)))
      return (None, body)
  
  class MonkeyCSSParser (CSSParser):
@@ -250,7 +250,7 @@ if __name__ == '__main__':
      urls = get_css(args.url)
      full = consolidate_css(
          urls, data_dir=args.data_dir, data_url=args.data_url)
-    bytes = full.encode('utf-8')
+    bytes = full  #full.encode('utf-8')
  
      if args.output == None:
          sys.stdout.write(bytes)
author	W. Trevor King <wking@drexel.edu>
	Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)
committer	W. Trevor King <wking@drexel.edu>
	Sun, 9 Jan 2011 13:52:48 +0000 (08:52 -0500)