text = text.replace(nl, '\n')
return text
-def get_page(url):
+def get_page(url, standardize_text=True):
LOG.info('get %s' % url)
f = urlopen(url)
info = f.info()
ctype = f.headers['content-type']
body = f.read()
f.close()
- if info.getmaintype() == 'text':
+ if info.getmaintype() == 'text' and standardize_text:
try:
- type,encoding = ctype.split('charset=')
+ _type,encoding = ctype.split('charset=')
except ValueError:
encoding = 'utf-8'
body = unicode(body, encoding)
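
For reference (not part of the patch), a minimal, self-contained sketch of the charset handling above, assuming the Python 2 semantics this script relies on; the header value is a made-up example:

# Standalone illustration of the decode step in get_page.
ctype = 'text/html; charset=iso-8859-1'
try:
    _type, encoding = ctype.split('charset=')
except ValueError:
    encoding = 'utf-8'                   # no charset declared in the header
body = unicode('caf\xe9', encoding)      # only happens when standardize_text=True
print repr(body)                         # -> u'caf\xe9'
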
def get_css(url):
"Return urls for all CSS linked to from the (X)HTML at `url`."
- info,body = get_page(url)
+ info,body = get_page(url, standardize_text=False)
assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
if info.getsubtype() == 'html':
parser = etree.HTMLParser()
if info.gettype() != 'text/css':
LOG.warn('invalid type for %s: %s' % (url, info.gettype()))
return (None, None)
- LOG.info('returning CSS for %s' % url)
+ LOG.info('returning CSS for %s (type %s)' % (url, type(body)))
return (None, body)
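
As an aside (not part of the patch), the practical effect of passing standardize_text=False is that the body handed back for a stylesheet stays a plain byte string. A hypothetical check, assuming get_page is importable from this module and the URL is only a placeholder:

info, body = get_page('http://example.com/style.css', standardize_text=False)
assert info.getmaintype() == 'text'
assert isinstance(body, str)    # raw bytes in Python 2, never unicode
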
class MonkeyCSSParser (CSSParser):
urls = get_css(args.url)
full = consolidate_css(
urls, data_dir=args.data_dir, data_url=args.data_url)
- bytes = full.encode('utf-8')
+ bytes = full  # was: full.encode('utf-8')
if args.output == None:
sys.stdout.write(bytes)
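
A hedged aside on the stdout write: with this change `bytes` is whatever consolidate_css hands back, presumably raw bytes from get_page. If it were ever a unicode object instead, writing it to stdout in Python 2 could raise UnicodeEncodeError. A defensive sketch under that assumption (the write_css helper is hypothetical, not part of the script):

import sys

def write_css(out, css):
    # Hypothetical helper: encode only when actually holding unicode text;
    # byte strings pass through untouched.
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    out.write(css)

write_css(sys.stdout, u'body:after { content: "caf\xe9"; }\n')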