From 8f73173c67d4e979b697ff67b4614f8e59abf5b8 Mon Sep 17 00:00:00 2001 From: Lindsey Smith Date: Fri, 4 Mar 2011 12:00:00 +0000 Subject: [PATCH] Bump to version 2.71. --- CHANGELOG | 71 +++--- feedparser.py | 597 ++++++++++++++++++++++++++++++++++---------------- html2text.py | 104 +++++---- rss2email.py | 15 +- 4 files changed, 516 insertions(+), 271 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index fe78dbd..3fadbec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,30 +1,34 @@ +v2.71 (2011-03-04) + * Potentialy safer method for writing feeds.dat on UNIX + * Handle via links with no title attribute + * Handle attributes more cleanly with OVERRIDE_EMAIL and DEFAULT_EMAIL + v2.70 (2010-12-21) -* Improved handling of given feed email addresses to prevent mail servers rejecting poorly formed Froms -* Added X-RSS-TAGS header that lists any tags provided by an entry, which will be helpful in filtering incoming messages + * Improved handling of given feed email addresses to prevent mail servers rejecting poorly formed Froms + * Added X-RSS-TAGS header that lists any tags provided by an entry, which will be helpful in filtering incoming messages v2.69 (2010-11-12) - * Added support for connecting to SMTP server via SSL, see SMTP_SSL option - * Improved backwards compatibility by fixing issue with listing feeds when run with older Python versions - * Added selective feed email overrides through OVERRIDE_EMAIL and DEFAULT_EMAIL options - * Added NO_FRIENDLY_NAME to from from address only without the friendly name - * Added X-RSS-URL header in each message with the link to the original item + * Added support for connecting to SMTP server via SSL, see SMTP_SSL option + * Improved backwards compatibility by fixing issue with listing feeds when run with older Python versions + * Added selective feed email overrides through OVERRIDE_EMAIL and DEFAULT_EMAIL options + * Added NO_FRIENDLY_NAME to from from address only without the friendly name + * Added X-RSS-URL header in each message with the link to the original item v2.68 (2010-10-01) * Added ability to pause/resume checking of individual feeds through pause and unpause commands - * Added ability to import and export OPML feed lists through importopml and exportopml commands - + * Added ability to import and export OPML feed lists through importopml and exportopml commands + v2.67 (2010-09-21) * Fixed entries that include an id which is blank (i.e., an empty string) were being resent * Fixed some entries not being sent by email because they had bad From headers * Fixed From headers with HTML entities encoded twice * Compatibility changes to support most recent development versions of feedparser * Compatibility changes to support Google Reader feeds - + v2.66 (2009-12-21) - * Complete packaging of all necessary source files (rss2email, html2text, feedparser, r2e, etc.) 
into one bundle - o Included a more complete config.py with all options - o Default to HTML mail and CSS results + o Included a more complete config.py with all options + o Default to HTML mail and CSS results * Added 'reset' command to erase history of already seen entries * Changed project email to 'lindsey@allthingsrss.com' and project homepage to 'http://www.allthingsrss.com/rss2email/' * Made exception and error output text more useful @@ -42,42 +46,36 @@ v2.65 (2009-01-05) * No file locking for SunOS v2.64 (2008-10-21) - * Bug-fix version - o Gracefully handle missing charsets - o Friendlier and more useful message if sendmail isn't installed - o SunOS locking fix + o Gracefully handle missing charsets + o Friendlier and more useful message if sendmail isn't installed + o SunOS locking fix v2.63 (2008-06-13) - * Bug-fix version and license change: - o Licensed under GPL 2 & 3 now - o Display feed number in warning and error message lines - o Fix for unicode handling problem with certain entry titles + o Licensed under GPL 2 & 3 now + o Display feed number in warning and error message lines + o Fix for unicode handling problem with certain entry titles v2.62 (2008-01-14) - * Bug-fix version: - o Simplified SunOS fix - o Local feeds (/home/user/file.xml) should work + o Simplified SunOS fix + o Local feeds (/home/user/file.xml) should work v2.61 (2007-12-07) - * Bug-fix version: - o Now really compatible with SunOS - o Don't wrap long subject headers - o New parameter CHARSET_LIST to override or supplement the order in which charsets are tried against an entry - o Don't use blank content to generate id - o Using GMail as mail server should work - + o Now really compatible with SunOS + o Don't wrap long subject headers + o New parameter CHARSET_LIST to override or supplement the order in which charsets are tried against an entry + o Don't use blank content to generate id + o Using GMail as mail server should work + v2.60 (2006-08-25) - * Small bug-fix version: - o Now compatible with SunOS - o Correctly handle international character sets in email From + o Now compatible with SunOS + o Correctly handle international character sets in email From v2.59 (2006-06-09) - * Finally added oft-requested support for enclosures. Any enclosures, such as a podcast MP3, will be listed under the entry URL * Made feed timeout compatible with Python versions 2.2 and higher, instead of v2.4 only * Added optional, configurable CSS styling to HTML mail. Set USE_CSS_STYLING=1 in your config.py to enable this. If you want to tweak the look, modify STYLE_SHEET. @@ -86,7 +84,6 @@ v2.59 (2006-06-09) * Unfortunately, rss2email is no longer compatible with Python v2.1. Two of the most serious lingering issues with rss2email were waiting forever for non-responsive feeds and its inablility to properly handle feeds with international characters. To properly fix these once and for all, rss2email now depends on functionality that was not available until Python v2.2. Hopefully this does not unduly inconvenience anyone that has not yet upgraded to a more current version of Python. 
v2.58 (2006-05-11) - * Total rewrite of email code that should fix encoding problems * Added configurable timeout for nonresponsive feeds * Fixed incorrectly using text summary_detail instead of html content @@ -94,7 +91,6 @@ v2.58 (2006-05-11) * Print name of feed that is being deleted v2.57 (2006-04-07) - * Integrated Joey Hess's patches o First, a patch that makes delete more reliable, so it no longer allows you to remove the default email address ('feed' 0) and thereby hose your feed file, or 'remove' entries that don't exist without warning; and so it only says IDs have changed when they really have. Originally from http://bugs.debian.org/313101 o Next a patch that avoids a backtrace if there's no email address defined, and outputs a less scary error message. @@ -116,7 +112,6 @@ v2.57 (2006-04-07) * Broke contributors across multiple lines v2.56 (2006-04-04) - * SMTP AUTH support added * Windows support * Fixed bug with HTML in titles diff --git a/feedparser.py b/feedparser.py index 0760cc2..b9144a9 100644 --- a/feedparser.py +++ b/feedparser.py @@ -6,12 +6,11 @@ Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds Visit http://feedparser.org/ for the latest version Visit http://feedparser.org/docs/ for the latest documentation -Required: Python 2.1 or later -Recommended: Python 2.3 or later +Required: Python 2.4 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn" +__version__ = "5.0.1" __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -41,7 +40,9 @@ __contributors__ = ["Jason Diamond ", "Aaron Swartz ", "Kevin Marks ", "Sam Ruby ", - "Ade Oshineye "] + "Ade Oshineye ", + "Martin Pool ", + "Kurt McKee "] _debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. @@ -75,12 +76,73 @@ RESOLVE_RELATIVE_URIS = 1 # HTML content, set this to 1. 
SANITIZE_HTML = 1 -# ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 +# ---------- Python 3 modules (make it work if possible) ---------- +try: + import rfc822 +except ImportError: + from email import _parseaddr as rfc822 + +try: + # Python 3.1 introduces bytes.maketrans and simultaneously + # deprecates string.maketrans; use bytes.maketrans if possible + _maketrans = bytes.maketrans +except (NameError, AttributeError): + import string + _maketrans = string.maketrans + +# base64 support for Atom feeds that contain embedded binary data try: - from cStringIO import StringIO as _StringIO + import base64, binascii + # Python 3.1 deprecates decodestring in favor of decodebytes + _base64decode = getattr(base64, 'decodebytes', base64.decodestring) except: - from StringIO import StringIO as _StringIO + base64 = binascii = None + +def _s2bytes(s): + # Convert a UTF-8 str to bytes if the interpreter is Python 3 + try: + return bytes(s, 'utf8') + except (NameError, TypeError): + # In Python 2.5 and below, bytes doesn't exist (NameError) + # In Python 2.6 and above, bytes and str are the same (TypeError) + return s + +def _l2bytes(l): + # Convert a list of ints to bytes if the interpreter is Python 3 + try: + if bytes is not str: + # In Python 2.6 and above, this call won't raise an exception + # but it will return bytes([65]) as '[65]' instead of 'A' + return bytes(l) + raise NameError + except NameError: + return ''.join(map(chr, l)) + +# If you want feedparser to allow all URL schemes, set this to () +# List culled from Python's urlparse documentation at: +# http://docs.python.org/library/urlparse.html +# as well as from "URI scheme" at Wikipedia: +# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme +# Many more will likely need to be added! +ACCEPTABLE_URI_SCHEMES = ( + 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto', + 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', + 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais', + # Additional common-but-unofficial schemes + 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', + 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', +) +#ACCEPTABLE_URI_SCHEMES = () + +# ---------- required modules (should come with any Python distribution) ---------- +import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime +try: + from io import BytesIO as _StringIO +except ImportError: + try: + from cStringIO import StringIO as _StringIO + except: + from StringIO import StringIO as _StringIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- @@ -113,12 +175,6 @@ except: data = data.replace(char, entity) return data -# base64 support for Atom feeds that contain embedded binary data -try: - import base64, binascii -except: - base64 = binascii = None - # cjkcodecs and iconv_codec provide support for more character encodings. 
# Both are available from http://cjkpython.i18n.org/ try: @@ -171,17 +227,27 @@ class UndeclaredNamespace(Exception): pass sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.special = re.compile(']|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') + class EndBracketRegEx: + def __init__(self): + # Overriding the built-in sgmllib.endbracket regex allows the + # parser to find angle brackets embedded in element attributes. + self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') def search(self,string,index=0): - self.match = self.endbracket.match(string,index) - if self.match: return self - def start(self,n): + match = self.endbracket.match(string,index) + if match is not None: + # Returning a new object in the calling thread's context + # resolves a thread-safety. + return EndBracketMatch(match) + return None + class EndBracketMatch: + def __init__(self, match): + self.match = match + def start(self, n): return self.match.end(n) - sgmllib.endbracket = EndBracketMatch() + sgmllib.endbracket = EndBracketRegEx() SUPPORTED_VERSIONS = {'': 'unknown', 'rss090': 'RSS 0.90', @@ -219,7 +285,7 @@ class FeedParserDict(UserDict): 'guid': 'id', 'date': 'updated', 'date_parsed': 'updated_parsed', - 'description': ['subtitle', 'summary'], + 'description': ['summary', 'subtitle'], 'url': ['href'], 'modified': 'updated', 'modified_parsed': 'updated_parsed', @@ -244,9 +310,9 @@ class FeedParserDict(UserDict): realkey = self.keymap.get(key, key) if type(realkey) == types.ListType: for k in realkey: - if UserDict.has_key(self, k): + if UserDict.__contains__(self, k): return UserDict.__getitem__(self, k) - if UserDict.has_key(self, key): + if UserDict.__contains__(self, key): return UserDict.__getitem__(self, key) return UserDict.__getitem__(self, realkey) @@ -271,9 +337,12 @@ class FeedParserDict(UserDict): def has_key(self, key): try: - return hasattr(self, key) or UserDict.has_key(self, key) + return hasattr(self, key) or UserDict.__contains__(self, key) except AttributeError: return False + # This alias prevents the 2to3 tool from changing the semantics of the + # __contains__ function below and exhausting the maximum recursion depth + __has_key = has_key def __getattr__(self, key): try: @@ -293,7 +362,7 @@ class FeedParserDict(UserDict): return self.__setitem__(key, value) def __contains__(self, key): - return self.has_key(key) + return self.__has_key(key) def zopeCompatibilityHack(): global FeedParserDict @@ -326,9 +395,8 @@ def _ebcdic_to_ascii(s): 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 ) - import string - _ebcdic_to_ascii_map = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + _ebcdic_to_ascii_map = _maketrans( \ + _l2bytes(range(256)), _l2bytes(emap)) return s.translate(_ebcdic_to_ascii_map) _cp1252 = { @@ -482,6 +550,10 @@ class _FeedParserMixin: # normalize attrs attrs = [(k.lower(), v) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + # the sgml parser doesn't handle entities in attributes, but + # strict xml parsers do -- account for this difference + if isinstance(self, _LooseFeedParser): + attrs = [(k, v.replace('&', '&')) for k, v in attrs] # track xml:base and xml:lang attrsD = dict(attrs) @@ -491,7 +563,12 @@ class _FeedParserMixin: baseuri = unicode(baseuri, self.encoding) except: baseuri = unicode(baseuri, 'iso-8859-1') - self.baseuri = 
_urljoin(self.baseuri, baseuri) + # ensure that self.baseuri is always an absolute URI that + # uses a whitelisted URI scheme (e.g. not `javscript:`) + if self.baseuri: + self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri + else: + self.baseuri = _urljoin(self.baseuri, baseuri) lang = attrsD.get('xml:lang', attrsD.get('lang')) if lang == '': # xml:lang could be explicitly set to '', we need to capture that @@ -670,7 +747,7 @@ class _FeedParserMixin: def mapContentType(self, contentType): contentType = contentType.lower() - if contentType == 'text': + if contentType == 'text' or contentType == 'plain': contentType = 'text/plain' elif contentType == 'html': contentType = 'text/html' @@ -734,6 +811,11 @@ class _FeedParserMixin: else: pieces = pieces[1:-1] + # Ensure each piece is a str for Python 3 + for (i, v) in enumerate(pieces): + if not isinstance(v, basestring): + pieces[i] = v.decode('utf-8') + output = ''.join(pieces) if stripWhitespace: output = output.strip() @@ -742,11 +824,15 @@ class _FeedParserMixin: # decode base64 content if base64 and self.contentparams.get('base64', 0): try: - output = base64.decodestring(output) + output = _base64decode(output) except binascii.Error: pass except binascii.Incomplete: pass + except TypeError: + # In Python 3, base64 takes and outputs bytes, not str + # This may not be the most correct way to accomplish this + output = _base64decode(output.encode('utf-8')).decode('utf-8') # resolve relative URIs if (element in self.can_be_relative_uri) and output: @@ -804,7 +890,7 @@ class _FeedParserMixin: # address common error where people take data that is already # utf-8, presume that it is iso-8859-1, and re-encode it. - if self.encoding=='utf-8' and type(output) == type(u''): + if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''): try: output = unicode(output.encode('iso-8859-1'), 'utf-8') except: @@ -829,9 +915,14 @@ class _FeedParserMixin: contentparams['value'] = output self.entries[-1][element].append(contentparams) elif element == 'link': - self.entries[-1][element] = output - if output: - self.entries[-1]['links'][-1]['href'] = output + if not self.inimage: + # query variables in urls in link elements are improperly + # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're + # unhandled character references. fix this special case. + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + self.entries[-1][element] = output + if output: + self.entries[-1]['links'][-1]['href'] = output else: if element == 'description': element = 'summary' @@ -846,6 +937,9 @@ class _FeedParserMixin: element = 'subtitle' context[element] = output if element == 'link': + # fix query variables; see above for the explanation + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + context[element] = output context['links'][-1]['href'] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) @@ -873,21 +967,21 @@ class _FeedParserMixin: # text, but this is routinely ignored. This is an attempt to detect # the most common cases. As false positives often result in silent # data loss, this function errs on the conservative side. 
- def lookslikehtml(self, str): + def lookslikehtml(self, s): if self.version.startswith('atom'): return if self.contentparams.get('type','text/html') != 'text/plain': return # must have a close tag or a entity reference to qualify - if not (re.search(r'',str) or re.search("&#?\w+;",str)): return + if not (re.search(r'',s) or re.search("&#?\w+;",s)): return # all tags must be in a restricted subset of valid HTML tags if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, - re.findall(r' True, 'clean' to False, and any other value to None + # False and None both evaluate as False, so the difference can be ignored + # by applications that only need to know if the content is explicit. + self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] def _start_media_content(self, attrsD): context = self._getContext() @@ -1587,6 +1698,17 @@ class _FeedParserMixin: context = self._getContext() context['media_player']['content'] = value + def _start_newlocation(self, attrsD): + self.push('newlocation', 1) + + def _end_newlocation(self): + url = self.pop('newlocation') + context = self._getContext() + # don't set newlocation if the context isn't right + if context is not self.feeddata: + return + context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) + if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): @@ -1688,9 +1810,9 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): 'source', 'track', 'wbr' ] - def __init__(self, encoding, type): + def __init__(self, encoding, _type): self.encoding = encoding - self.type = type + self._type = _type if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) @@ -1707,7 +1829,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): def parse_starttag(self,i): j=sgmllib.SGMLParser.parse_starttag(self, i) - if self.type == 'application/xhtml+xml': + if self._type == 'application/xhtml+xml': if j>2 and self.rawdata[j-2:j]=='/>': self.unknown_endtag(self.lasttag) return j @@ -1718,8 +1840,14 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) + try: + bytes + if bytes is str: + raise NameError + self.encoding = self.encoding + '_INVALID_PYTHON_3' + except NameError: + if self.encoding and type(data) == type(u''): + data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) sgmllib.SGMLParser.close(self) @@ -1748,7 +1876,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): value = unicode(value, self.encoding) except: value = unicode(value, 'iso-8859-1') - uattrs.append((unicode(key, self.encoding), value)) + try: + # Currently, in Python 3 the key is already a str, and cannot be decoded again + uattrs.append((unicode(key, self.encoding), value)) + except TypeError: + uattrs.append((key, value)) strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) if self.encoding: try: @@ -1839,6 +1971,14 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): '''Return processed HTML as a single string''' return ''.join([str(p) for p in self.pieces]) + def parse_declaration(self, i): + try: + return sgmllib.SGMLParser.parse_declaration(self, i) + except sgmllib.SGMLParseError: + # escape the doctype declaration and continue 
parsing + self.handle_data('<') + return i+1 + class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): def __init__(self, baseuri, baselang, encoding, entities): sgmllib.SGMLParser.__init__(self) @@ -2018,10 +2158,10 @@ class _MicroformatsParser: arLines = [] def processSingleString(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) + sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) - return sValue or '' + return sValue or u'' def processSingleURI(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.URI) @@ -2070,8 +2210,8 @@ class _MicroformatsParser: sAgentValue = sAgentValue.replace(';', '\\;') if sAgentValue: arLines.append(self.vcardFold('AGENT:' + sAgentValue)) - elmAgent['class'] = '' - elmAgent.contents = [] + # Completely remove the agent element from the parse tree + elmAgent.extract() else: sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); if sAgentValue: @@ -2218,8 +2358,8 @@ class _MicroformatsParser: processSingleURI('key') if arLines: - arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] - sVCards += '\n'.join(arLines) + '\n' + arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] + sVCards += u'\n'.join(arLines) + u'\n' return sVCards.strip() @@ -2276,7 +2416,12 @@ class _MicroformatsParser: def _parseMicroformats(htmlSource, baseURI, encoding): if not BeautifulSoup: return if _debug: sys.stderr.write('entering _parseMicroformats\n') - p = _MicroformatsParser(htmlSource, baseURI, encoding) + try: + p = _MicroformatsParser(htmlSource, baseURI, encoding) + except UnicodeEncodeError: + # sgmllib throws this exception when performing lookups of tags + # with non-ASCII characters in them. 
+ return p.vcard = p.findVCards(p.document) p.findTags() p.findEnclosures() @@ -2310,12 +2455,12 @@ class _RelativeURIResolver(_BaseHTMLProcessor): ('q', 'cite'), ('script', 'src')] - def __init__(self, baseuri, encoding, type): - _BaseHTMLProcessor.__init__(self, encoding, type) + def __init__(self, baseuri, encoding, _type): + _BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolveURI(self, uri): - return _urljoin(self.baseuri, uri.strip()) + return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) def unknown_starttag(self, tag, attrs): if _debug: @@ -2324,27 +2469,44 @@ class _RelativeURIResolver(_BaseHTMLProcessor): attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) -def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): +def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') - p = _RelativeURIResolver(baseURI, encoding, type) + p = _RelativeURIResolver(baseURI, encoding, _type) p.feed(htmlSource) return p.output() +def _makeSafeAbsoluteURI(base, rel=None): + # bail if ACCEPTABLE_URI_SCHEMES is empty + if not ACCEPTABLE_URI_SCHEMES: + return _urljoin(base, rel or u'') + if not base: + return rel or u'' + if not rel: + scheme = urlparse.urlparse(base)[0] + if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: + return base + return u'' + uri = _urljoin(base, rel) + if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: + return u'' + return uri + class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', - 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', - 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', - 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', - 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', - 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', - 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', - 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', - 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', - 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', - 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', - 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] + acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', + 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', + 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', + 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', + 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', + 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', + 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', + 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', + 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', + 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', + 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', + 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'autocomplete', 
'autofocus', 'axis', @@ -2468,7 +2630,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): self.unacceptablestack += 1 # add implicit namespaces to html5 inline svg/mathml - if self.type.endswith('html'): + if self._type.endswith('html'): if not dict(attrs).get('xmlns'): if tag=='svg': attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) @@ -2513,6 +2675,9 @@ class _HTMLSanitizer(_BaseHTMLProcessor): for key, value in self.normalize_attrs(attrs): if key in acceptable_attributes: key=keymap.get(key,key) + # make sure the uri uses an acceptable uri scheme + if key == u'href': + value = _makeSafeAbsoluteURI(value) clean_attrs.append((key,value)) elif key=='style': clean_value = self.sanitize_style(value) @@ -2568,9 +2733,22 @@ class _HTMLSanitizer(_BaseHTMLProcessor): return ' '.join(clean) - -def _sanitizeHTML(htmlSource, encoding, type): - p = _HTMLSanitizer(encoding, type) + def parse_comment(self, i, report=1): + ret = _BaseHTMLProcessor.parse_comment(self, i, report) + if ret >= 0: + return ret + # if ret == -1, this may be a malicious attempt to circumvent + # sanitization, or a page-destroying unclosed comment + match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) + if match: + return match.end() + # unclosed comment; deliberately fail to handle_data() + return len(self.rawdata) + + +def _sanitizeHTML(htmlSource, encoding, _type): + p = _HTMLSanitizer(encoding, _type) + htmlSource = htmlSource.replace('= '2.3.3' assert base64 != None - user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') + user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':') realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] self.add_password(realm, host, user, passw) retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) @@ -2662,7 +2840,7 @@ class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler except: return self.http_error_default(req, fp, code, msg, headers) -def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): +def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers): """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -2689,6 +2867,9 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If handlers is supplied, it is a list of handlers used to build a urllib2 opener. + + if request_headers is supplied it is a dictionary of HTTP request headers + that will override the values generated by FeedParser. 
""" if hasattr(url_file_stream_or_string, 'read'): @@ -2697,7 +2878,12 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if url_file_stream_or_string == '-': return sys.stdin - if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): + if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): + # Deal with the feed URI scheme + if url_file_stream_or_string.startswith('feed:http'): + url_file_stream_or_string = url_file_stream_or_string[5:] + elif url_file_stream_or_string.startswith('feed:'): + url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] if not agent: agent = USER_AGENT # test for inline user:password for basic auth @@ -2709,48 +2895,20 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h user_passwd, realhost = urllib.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) - auth = base64.encodestring(user_passwd).strip() + auth = base64.standard_b64encode(user_passwd).strip() # iri support try: if isinstance(url_file_stream_or_string,unicode): - url_file_stream_or_string = url_file_stream_or_string.encode('idna') + url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8') else: - url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') + url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8') except: pass # try to open with urllib2 (to use optional headers) - request = urllib2.Request(url_file_stream_or_string) - request.add_header('User-Agent', agent) - if etag: - request.add_header('If-None-Match', etag) - if type(modified) == type(''): - modified = _parse_date(modified) - if modified: - # format into an RFC 1123-compliant timestamp. We can't use - # time.strftime() since the %a and %b directives can be affected - # by the current locale, but RFC 2616 states that dates must be - # in English. 
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) - if referrer: - request.add_header('Referer', referrer) - if gzip and zlib: - request.add_header('Accept-encoding', 'gzip, deflate') - elif gzip: - request.add_header('Accept-encoding', 'gzip') - elif zlib: - request.add_header('Accept-encoding', 'deflate') - else: - request.add_header('Accept-encoding', '') - if auth: - request.add_header('Authorization', 'Basic %s' % auth) - if ACCEPT_HEADER: - request.add_header('Accept', ACCEPT_HEADER) - request.add_header('A-IM', 'feed') # RFC 3229 support - opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) + request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) + opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()])) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: return opener.open(request) @@ -2759,13 +2917,51 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # try to open with native open function (if url_file_stream_or_string is a filename) try: - return open(url_file_stream_or_string) + return open(url_file_stream_or_string, 'rb') except: pass # treat url_file_stream_or_string as string return _StringIO(str(url_file_stream_or_string)) +def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): + request = urllib2.Request(url) + request.add_header('User-Agent', agent) + if etag: + request.add_header('If-None-Match', etag) + if type(modified) == type(''): + modified = _parse_date(modified) + elif isinstance(modified, datetime.datetime): + modified = modified.utctimetuple() + if modified: + # format into an RFC 1123-compliant timestamp. We can't use + # time.strftime() since the %a and %b directives can be affected + # by the current locale, but RFC 2616 states that dates must be + # in English. 
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + if referrer: + request.add_header('Referer', referrer) + if gzip and zlib: + request.add_header('Accept-encoding', 'gzip, deflate') + elif gzip: + request.add_header('Accept-encoding', 'gzip') + elif zlib: + request.add_header('Accept-encoding', 'deflate') + else: + request.add_header('Accept-encoding', '') + if auth: + request.add_header('Authorization', 'Basic %s' % auth) + if ACCEPT_HEADER: + request.add_header('Accept', ACCEPT_HEADER) + # use this for whatever -- cookies, special headers, etc + # [('Cookie','Something'),('x-special-header','Another Value')] + for header_name, header_value in request_headers.items(): + request.add_header(header_name, header_value) + request.add_header('A-IM', 'feed') # RFC 3229 support + return request + _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' @@ -2799,9 +2995,15 @@ _iso8601_re = [ + r'(\.(?P\d+))?' + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl] -del tmpl +try: + del tmpl +except NameError: + pass _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -del regex +try: + del regex +except NameError: + pass def _parse_date_iso8601(dateString): '''Parse a variety of ISO-8601-compatible formats like 20040105''' m = None @@ -2875,7 +3077,7 @@ def _parse_date_iso8601(dateString): # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) # which is guaranteed to normalize d/m/y/h/m/s. # Many implementations have bugs, but we'll pretend they don't. - return time.localtime(time.mktime(tm)) + return time.localtime(time.mktime(tuple(tm))) registerDateHandler(_parse_date_iso8601) # 8-bit date handling routines written by ytrewq1. @@ -3116,12 +3318,12 @@ def _parse_date_w3dtf(dateString): __date_re = ('(?P\d\d\d\d)' '(?:(?P-|)' - '(?:(?P\d\d\d)' - '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?') + '(?:(?P\d\d)(?:(?P=dsep)(?P\d\d))?' + '|(?P\d\d\d)))?') __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)' __tzd_rx = re.compile(__tzd_re) __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' - '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' + '(?:(?P=tsep)(?P\d\d)(?:[.,]\d+)?)?' + __tzd_re) __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) __datetime_rx = re.compile(__datetime_re) @@ -3145,6 +3347,10 @@ def _parse_date_rfc822(dateString): else: data.append('') dateString = " ".join(data) + # Account for the Etc/GMT timezone by stripping 'Etc/' + elif len(data) == 5 and data[4].lower().startswith('etc/'): + data[4] = data[4][4:] + dateString = " ".join(data) if len(data) < 5: dateString += ' 00:00:00 GMT' tm = rfc822.parsedate_tz(dateString) @@ -3249,59 +3455,59 @@ def _getCharacterEncoding(http_headers, xml_data): sniffed_xml_encoding = '' xml_encoding = '' true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) + http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type'))) # Must sniff for non-ASCII-compatible character encodings before # searching for XML declaration. 
This heuristic is defined in # section F of the XML specification: # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': + if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]): # EBCDIC xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': + elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]): # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': + elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]): # UTF-16LE sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': + elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]): # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': + elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]): # UTF-32LE sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': + elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]): # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': + elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]): # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': + elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]): # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: # ASCII-compatible pass - xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) + xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data) except: xml_encoding_match = None if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].lower() + xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower() if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding acceptable_content_type = 0 @@ -3317,7 +3523,7 @@ def _getCharacterEncoding(http_headers, xml_data): true_encoding = http_encoding or 'us-ascii' elif http_content_type.startswith('text/'): true_encoding = http_encoding or 'us-ascii' - elif http_headers and (not http_headers.has_key('content-type')): + elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))): true_encoding = xml_encoding or 'iso-8859-1' else: true_encoding = xml_encoding or 'utf-8' @@ -3335,35 +3541,35 @@ def _toUTF8(data, 
encoding): ''' if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): + if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16be': sys.stderr.write('trying utf-16be instead\n') encoding = 'utf-16be' data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): + elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16le': sys.stderr.write('trying utf-16le instead\n') encoding = 'utf-16le' data = data[2:] - elif data[:3] == '\xef\xbb\xbf': + elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-8': sys.stderr.write('trying utf-8 instead\n') encoding = 'utf-8' data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': + elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32be': sys.stderr.write('trying utf-32be instead\n') encoding = 'utf-32be' data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': + elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32le': @@ -3386,43 +3592,47 @@ def _stripDoctype(data): rss_version may be 'rss091n' or None stripped_data is the same XML document, minus the DOCTYPE ''' - start = re.search('<\w',data) + start = re.search(_s2bytes('<\w'), data) start = start and start.start() or -1 head,data = data[:start+1], data[start+1:] - entity_pattern = re.compile(r'^\s*]*?)>', re.MULTILINE) + entity_pattern = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE) entity_results=entity_pattern.findall(head) - head = entity_pattern.sub('', head) - doctype_pattern = re.compile(r'^\s*]*?)>', re.MULTILINE) + head = entity_pattern.sub(_s2bytes(''), head) + doctype_pattern = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE) doctype_results = doctype_pattern.findall(head) - doctype = doctype_results and doctype_results[0] or '' - if doctype.lower().count('netscape'): + doctype = doctype_results and doctype_results[0] or _s2bytes('') + if doctype.lower().count(_s2bytes('netscape')): version = 'rss091n' else: version = None # only allow in 'safe' inline entity definitions - replacement='' + replacement=_s2bytes('') if len(doctype_results)==1 and entity_results: - safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') + safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')) safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) if safe_entities: - replacement='\n]>' % '>\n \n \n]>') data = doctype_pattern.sub(replacement, head) + data - return version, data, dict(replacement and safe_pattern.findall(replacement)) + return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)]) + +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}): + '''Parse a feed from a URL, file, stream, or string. 
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): - '''Parse a feed from a URL, file, stream, or string''' + request_headers, if given, is a dict from http header name to value to add + to the request; this overrides internally generated values. + ''' result = FeedParserDict() result['feed'] = FeedParserDict() result['entries'] = [] if _XML_AVAILABLE: result['bozo'] = 0 - if type(handlers) == types.InstanceType: + if not isinstance(handlers, list): handlers = [handlers] try: - f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) + f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers) data = f.read() except Exception, e: result['bozo'] = 1 @@ -3430,9 +3640,17 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer data = None f = None + if hasattr(f, 'headers'): + result['headers'] = dict(f.headers) + # overwrite existing headers using response_headers + if 'headers' in result: + result['headers'].update(response_headers) + elif response_headers: + result['headers'] = copy.deepcopy(response_headers) + # if feed is gzip-compressed, decompress it - if f and data and hasattr(f, 'headers'): - if gzip and f.headers.get('content-encoding', '') == 'gzip': + if f and data and 'headers' in result: + if gzip and result['headers'].get('content-encoding') == 'gzip': try: data = gzip.GzipFile(fileobj=_StringIO(data)).read() except Exception, e: @@ -3443,7 +3661,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo'] = 1 result['bozo_exception'] = e data = '' - elif zlib and f.headers.get('content-encoding', '') == 'deflate': + elif zlib and result['headers'].get('content-encoding') == 'deflate': try: data = zlib.decompress(data, -zlib.MAX_WBITS) except Exception, e: @@ -3452,21 +3670,20 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer data = '' # save HTTP headers - if hasattr(f, 'info'): - info = f.info() - etag = info.getheader('ETag') - if etag: - result['etag'] = etag - last_modified = info.getheader('Last-Modified') - if last_modified: - result['modified'] = _parse_date(last_modified) + if 'headers' in result: + if 'etag' in result['headers'] or 'ETag' in result['headers']: + etag = result['headers'].get('etag', result['headers'].get('ETag')) + if etag: + result['etag'] = etag + if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']: + modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified')) + if modified: + result['modified'] = _parse_date(modified) if hasattr(f, 'url'): result['href'] = f.url result['status'] = 200 if hasattr(f, 'status'): result['status'] = f.status - if hasattr(f, 'headers'): - result['headers'] = f.headers.dict if hasattr(f, 'close'): f.close() @@ -3479,8 +3696,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ _getCharacterEncoding(http_headers, data) if http_headers and (not acceptable_content_type): - if http_headers.has_key('content-type'): - bozo_message = '%s is not an XML media type' % http_headers['content-type'] + if http_headers.has_key('content-type') or http_headers.has_key('Content-type'): + bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type')) else: bozo_message = 'no 
Content-type specified' result['bozo'] = 1 @@ -3489,8 +3706,12 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer if data is not None: result['version'], data, entities = _stripDoctype(data) - baseuri = http_headers.get('content-location', result.get('href')) - baselang = http_headers.get('content-language', None) + # ensure that baseuri is an absolute uri using an acceptable URI scheme + contentloc = http_headers.get('content-location', http_headers.get('Content-Location', '')) + href = result.get('href', '') + baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href + + baselang = http_headers.get('content-language', http_headers.get('Content-Language', None)) # if server sent 304, we're done if result.get('status', 0) == 304: @@ -3566,7 +3787,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer elif proposed_encoding != result['encoding']: result['bozo'] = 1 result['bozo_exception'] = CharacterEncodingOverride( \ - 'documented declared as %s, but parsed as %s' % \ + 'document declared as %s, but parsed as %s' % \ (result['encoding'], proposed_encoding)) result['encoding'] = proposed_encoding @@ -3597,8 +3818,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo_exception'] = feedparser.exc or e use_strict_parser = 0 if not use_strict_parser: - feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) - feedparser.feed(data) + feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities) + feedparser.feed(data.decode('utf-8', 'replace')) result['feed'] = feedparser.feeddata result['entries'] = feedparser.entries result['version'] = result['version'] or feedparser.version diff --git a/html2text.py b/html2text.py index 8562a7d..0ed4cec 100644 --- a/html2text.py +++ b/html2text.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "2.37" +__version__ = "3.01" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] @@ -8,11 +8,29 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] # TODO: # Support decoded entities with unifiable. 
-if not hasattr(__builtins__, 'True'): True, False = 1, 0 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types -import sgmllib -import urlparse -sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') +try: + True +except NameError: + setattr(__builtins__, 'True', 1) + setattr(__builtins__, 'False', 0) + +def has_key(x, y): + if hasattr(x, 'has_key'): return x.has_key(y) + else: return y in x + +try: + import htmlentitydefs + import urlparse + import HTMLParser +except ImportError: #Python3 + import html.entities as htmlentitydefs + import urllib.parse as urlparse + import html.parser as HTMLParser +try: #Python3 + import urllib.request as urllib +except: + import urllib +import re, sys, codecs, types try: from textwrap import wrap except: pass @@ -64,15 +82,22 @@ def charref(name): if not UNICODE_SNOB and c in unifiable_n.keys(): return unifiable_n[c] else: - return unichr(c) + try: + return unichr(c) + except NameError: #Python3 + return chr(c) def entityref(c): if not UNICODE_SNOB and c in unifiable.keys(): return unifiable[c] else: try: name2cp(c) - except KeyError: return "&" + c - else: return unichr(name2cp(c)) + except KeyError: return "&" + c + ';' + else: + try: + return unichr(name2cp(c)) + except NameError: #Python3 + return chr(name2cp(c)) def replaceEntities(s): s = s.group(1) @@ -83,14 +108,6 @@ def replaceEntities(s): r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") def unescape(s): return r_unescape.sub(replaceEntities, s) - -def fixattrs(attrs): - # Fix bug in sgmllib.py - if not attrs: return attrs - newattrs = [] - for attr in attrs: - newattrs.append((attr[0], unescape(attr[1]))) - return newattrs ### End Entity Nonsense ### @@ -133,13 +150,16 @@ def hn(tag): if n in range(1, 10): return n except ValueError: return 0 -class _html2text(sgmllib.SGMLParser): +class _html2text(HTMLParser.HTMLParser): def __init__(self, out=None, baseurl=''): - sgmllib.SGMLParser.__init__(self) + HTMLParser.HTMLParser.__init__(self) if out is None: self.out = self.outtextf else: self.out = out - self.outtext = u'' + try: + self.outtext = unicode() + except NameError: # Python3 + self.outtext = str() self.quiet = 0 self.p_p = 0 self.outcount = 0 @@ -162,7 +182,7 @@ class _html2text(sgmllib.SGMLParser): self.outtext += s def close(self): - sgmllib.SGMLParser.close(self) + HTMLParser.HTMLParser.close(self) self.pbr() self.o('', 0, 'end') @@ -175,10 +195,10 @@ class _html2text(sgmllib.SGMLParser): def handle_entityref(self, c): self.o(entityref(c)) - def unknown_starttag(self, tag, attrs): + def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) - def unknown_endtag(self, tag): + def handle_endtag(self, tag): self.handle_tag(tag, None, 0) def previousIndex(self, attrs): @@ -187,16 +207,16 @@ class _html2text(sgmllib.SGMLParser): If the set of attributes is not found, returns None """ - if not attrs.has_key('href'): return None + if not has_key(attrs, 'href'): return None i = -1 for a in self.a: i += 1 match = 0 - if a.has_key('href') and a['href'] == attrs['href']: - if a.has_key('title') or attrs.has_key('title'): - if (a.has_key('title') and attrs.has_key('title') and + if has_key(a, 'href') and a['href'] == attrs['href']: + if has_key(a, 'title') or has_key(attrs, 'title'): + if (has_key(a, 'title') and has_key(attrs, 'title') and a['title'] == attrs['title']): match = True else: @@ -205,7 +225,7 @@ class _html2text(sgmllib.SGMLParser): if match: return i def handle_tag(self, tag, attrs, start): - attrs = fixattrs(attrs) + #attrs = 
fixattrs(attrs) if hn(tag): self.p() @@ -246,7 +266,7 @@ class _html2text(sgmllib.SGMLParser): self.abbr_title = None self.abbr_data = '' - if attrs.has_key('title'): + if has_key(attrs, 'title'): self.abbr_title = attrs['title'] else: if self.abbr_title != None: @@ -259,7 +279,7 @@ class _html2text(sgmllib.SGMLParser): attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD - if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): self.astack.append(attrs) self.o("[") else: @@ -276,13 +296,13 @@ class _html2text(sgmllib.SGMLParser): a['count'] = self.acount a['outcount'] = self.outcount self.a.append(a) - self.o("][" + `a['count']` + "]") + self.o("][" + str(a['count']) + "]") if tag == "img" and start: attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD - if attrs.has_key('src'): + if has_key(attrs, 'src'): attrs['href'] = attrs['src'] alt = attrs.get('alt', '') i = self.previousIndex(attrs) @@ -295,7 +315,7 @@ class _html2text(sgmllib.SGMLParser): self.a.append(attrs) self.o("![") self.o(alt) - self.o("]["+`attrs['count']`+"]") + self.o("]["+ str(attrs['count']) +"]") if tag == 'dl' and start: self.p() if tag == 'dt' and not start: self.pbr() @@ -319,7 +339,7 @@ class _html2text(sgmllib.SGMLParser): if li['name'] == "ul": self.o("* ") elif li['name'] == "ol": li['num'] += 1 - self.o(`li['num']`+". ") + self.o(str(li['num'])+". ") self.start = 1 else: self.pbr() @@ -388,8 +408,8 @@ class _html2text(sgmllib.SGMLParser): newa = [] for link in self.a: if self.outcount > link['outcount']: - self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) - if link.has_key('title'): self.out(" ("+link['title']+")") + self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) + if has_key(link, 'title'): self.out(" ("+link['title']+")") self.out("\n") else: newa.append(link) @@ -413,7 +433,7 @@ class _html2text(sgmllib.SGMLParser): def unknown_decl(self, data): pass -def wrapwrite(text): sys.stdout.write(text.encode('utf8')) +def wrapwrite(text): sys.stdout.write(text) def html2text_file(html, out=wrapwrite, baseurl=''): h = _html2text(out, baseurl) @@ -428,7 +448,7 @@ if __name__ == "__main__": baseurl = '' if sys.argv[1:]: arg = sys.argv[1] - if arg.startswith('http://'): + if arg.startswith('http://') or arg.startswith('https://'): baseurl = arg j = urllib.urlopen(baseurl) try: @@ -444,8 +464,10 @@ if __name__ == "__main__": encoding = 'utf8' if len(sys.argv) > 2: encoding = sys.argv[2] - data = open(arg, 'r').read().decode(encoding) + try: #Python3 + data = open(arg, 'r', encoding=encoding).read() + except TypeError: + data = open(arg, 'r').read().decode(encoding) else: - data = sys.stdin.read().decode('utf8') + data = sys.stdin.read() wrapwrite(html2text(data, baseurl)) - diff --git a/rss2email.py b/rss2email.py index 40b4b4d..0dc2d04 100755 --- a/rss2email.py +++ b/rss2email.py @@ -446,7 +446,7 @@ def getEmail(r, entry): if FORCE_FROM: return DEFAULT_FROM - if r.url in OVERRIDE_EMAIL.keys(): + if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys(): return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM) if 'email' in entry.get('author_detail', []): @@ -462,7 +462,7 @@ def getEmail(r, entry): if feed.get("errorreportsto", ''): return validateEmail(feed.errorreportsto, DEFAULT_FROM) - if r.url in DEFAULT_EMAIL.keys(): + if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys(): return 
DEFAULT_EMAIL[r.url] return DEFAULT_FROM @@ -507,7 +507,11 @@ def unlock(feeds, feedfileObject): if not unix: pickle.dump(feeds, open(feedfile, 'w')) else: - pickle.dump(feeds, open(feedfile+'.tmp', 'w')) + fd = open(feedfile+'.tmp', 'w') + pickle.dump(feeds, fd) + fd.flush() + os.fsync(fd.fileno()) + fd.close() os.rename(feedfile+'.tmp', feedfile) fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN) @@ -731,7 +735,10 @@ def run(num=None): if ('rel' in extralink) and extralink['rel'] == u'via': extraurl = extralink['href'] extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/') - content += '
Via: '+extralink['title']+'\n'
+ viatitle = extraurl
+ if ('title' in extralink):
+     viatitle = extralink['title']
+ content += 'Via: '+viatitle+'\n'
  content += ' \n'
  content += "\n\n"
  else:
-- 
2.26.2
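
Note on the unlock() change in rss2email.py above: the patch replaces a direct pickle.dump(feeds, open(feedfile, 'w')) with a write-to-temporary-file, flush, fsync, rename sequence, which is what the changelog describes as a potentially safer way of writing feeds.dat on UNIX. Below is a minimal standalone sketch of that same pattern; the save_state helper name and the generic obj payload are illustrative assumptions, not part of rss2email itself.

    import os
    import pickle

    def save_state(path, obj):
        """Write obj to path by way of a temporary file in the same
        directory, so an interrupted write cannot truncate the old file."""
        tmp = path + '.tmp'
        fd = open(tmp, 'wb')
        try:
            pickle.dump(obj, fd)
            fd.flush()
            os.fsync(fd.fileno())   # push the data to disk before renaming
        finally:
            fd.close()
        os.rename(tmp, path)        # atomic replacement on POSIX filesystems

    # usage sketch
    save_state('feeds.dat', {'example': 'state'})

On POSIX systems, os.rename() replaces the destination in a single step when source and target are on the same filesystem, so a crash during the dump leaves the previous feeds.dat intact rather than a half-written one.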