From eb04d97b99d96d12b5b21447b4b1cac286ed49d1 Mon Sep 17 00:00:00 2001 From: Lindsey Smith Date: Tue, 21 Sep 2010 12:00:00 +0000 Subject: [PATCH] Bump to version 2.67. --- CHANGELOG | 7 ++++ config.py | 0 feedparser.py | 100 +++++++++++++++++++++++++++++++++++++++----------- r2e | 0 rss2email.py | 47 ++++++++++++++++++------ 5 files changed, 122 insertions(+), 32 deletions(-) mode change 100644 => 100755 config.py mode change 100644 => 100755 r2e mode change 100644 => 100755 rss2email.py diff --git a/CHANGELOG b/CHANGELOG index 171f7d5..d9cf7ba 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +v2.67 (2010-09-21) + * Fixed entries that include an id which is blank (i.e., an empty string) were being resent + * Fixed some entries not being sent by email because they had bad From headers + * Fixed From headers with HTML entities encoded twice + * Compatibility changes to support most recent development versions of feedparser + * Compatibility changes to support Google Reader feeds + v2.66 (2009-12-21) * Complete packaging of all necessary source files (rss2email, html2text, feedparser, r2e, etc.) into one bundle diff --git a/config.py b/config.py old mode 100644 new mode 100755 diff --git a/feedparser.py b/feedparser.py index 3cfde1b..0760cc2 100644 --- a/feedparser.py +++ b/feedparser.py @@ -40,7 +40,8 @@ __contributors__ = ["Jason Diamond ", "Fazal Majid ", "Aaron Swartz ", "Kevin Marks ", - "Sam Ruby "] + "Sam Ruby ", + "Ade Oshineye "] _debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. @@ -407,6 +408,8 @@ class _FeedParserMixin: 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', 'http://purl.org/rss/1.0/modules/link/': 'l', 'http://search.yahoo.com/mrss': 'media', + #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace + 'http://search.yahoo.com/mrss/': 'media', 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', @@ -547,7 +550,15 @@ class _FeedParserMixin: method = getattr(self, methodname) return method(attrsD) except AttributeError: - return self.push(prefix + suffix, 1) + # Since there's no handler or something has gone wrong we explicitly add the element and its attributes + unknown_tag = prefix + suffix + if len(attrsD) == 0: + # No attributes so merge it into the encosing dictionary + return self.push(unknown_tag, 1) + else: + # Has attributes so create it in its own dictionary + context = self._getContext() + context[unknown_tag] = attrsD def unknown_endtag(self, tag): if _debug: sys.stderr.write('end %s\n' % tag) @@ -643,12 +654,19 @@ class _FeedParserMixin: if _debug: sys.stderr.write('entering parse_declaration\n') if self.rawdata[i:i+9] == '', i) - if k == -1: k = len(self.rawdata) + if k == -1: + # CDATA block began but didn't finish + k = len(self.rawdata) + return k self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) return k+3 else: k = self.rawdata.find('>', i) - return k+1 + if k >= 0: + return k+1 + else: + # We have an incomplete CDATA block. + return k def mapContentType(self, contentType): contentType = contentType.lower() @@ -919,7 +937,10 @@ class _FeedParserMixin: '0.92': 'rss092', '0.93': 'rss093', '0.94': 'rss094'} - if not self.version: + #If we're here then this is an RSS feed. + #If we don't have a version or have a version that starts with something + #other than RSS then there's been a mistake. Correct it. + if not self.version or not self.version.startswith('rss'): attr_version = attrsD.get('version', '') version = versionmap.get(attr_version) if version: @@ -1110,7 +1131,7 @@ class _FeedParserMixin: def _getContext(self): if self.insource: context = self.sourcedata - elif self.inimage: + elif self.inimage and self.feeddata.has_key('image'): context = self.feeddata['image'] elif self.intextinput: context = self.feeddata['textinput'] @@ -1481,11 +1502,18 @@ class _FeedParserMixin: context['id'] = href def _start_source(self, attrsD): + if 'url' in attrsD: + # This means that we're processing a source element from an RSS 2.0 feed + self.sourcedata['href'] = attrsD[u'url'] + self.push('source', 1) self.insource = 1 self.hasTitle = 0 def _end_source(self): self.insource = 0 + value = self.pop('source') + if value: + self.sourcedata['title'] = value self._getContext()['source'] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() @@ -1550,6 +1578,15 @@ class _FeedParserMixin: if not context['media_thumbnail'][-1].has_key('url'): context['media_thumbnail'][-1]['url'] = url + def _start_media_player(self, attrsD): + self.push('media_player', 0) + self._getContext()['media_player'] = FeedParserDict(attrsD) + + def _end_media_player(self): + value = self.pop('media_player') + context = self._getContext() + context['media_player']['content'] = value + if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): @@ -1558,9 +1595,12 @@ if _XML_AVAILABLE: _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None + self.decls = {} def startPrefixMapping(self, prefix, uri): self.trackNamespace(prefix, uri) + if uri == 'http://www.w3.org/1999/xlink': + self.decls['xmlns:'+prefix] = uri def startElementNS(self, name, qname, attrs): namespace, localname = name @@ -1585,7 +1625,7 @@ if _XML_AVAILABLE: # the qnames the SAX parser gives us (if indeed it gives us any # at all). Thanks to MatejC for helping me test this and # tirelessly telling me that it didn't work yet. - attrsD = {} + attrsD, self.decls = self.decls, {} if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': attrsD['xmlns']=namespace if localname=='svg' and namespace=='http://www.w3.org/2000/svg': @@ -1634,7 +1674,7 @@ if _XML_AVAILABLE: def error(self, exc): self.bozo = 1 self.exc = exc - + def fatalError(self, exc): self.error(exc) raise exc @@ -1642,15 +1682,18 @@ if _XML_AVAILABLE: class _BaseHTMLProcessor(sgmllib.SGMLParser): special = re.compile('''[<>'"]''') bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - + elements_no_end_tag = [ + 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', + 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', + 'source', 'track', 'wbr' + ] + def __init__(self, encoding, type): self.encoding = encoding self.type = type if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) - + def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) @@ -1748,7 +1791,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) + if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text) self.pieces.append(text) def handle_comment(self, text): @@ -2275,12 +2318,16 @@ class _RelativeURIResolver(_BaseHTMLProcessor): return _urljoin(self.baseuri, uri.strip()) def unknown_starttag(self, tag, attrs): + if _debug: + sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs))) attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - + def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): - if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') + if _debug: + sys.stderr.write('entering _resolveRelativeURIs\n') + p = _RelativeURIResolver(baseURI, encoding, type) p.feed(htmlSource) return p.output() @@ -2420,6 +2467,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor): if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 + # add implicit namespaces to html5 inline svg/mathml + if self.type.endswith('html'): + if not dict(attrs).get('xmlns'): + if tag=='svg': + attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) + if tag=='math': + attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') ) + # not otherwise acceptable, perhaps it is MathML or SVG? if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: self.mathmlOK += 1 @@ -2493,7 +2548,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor): # gauntlet if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' - if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' + # This replaced a regexp that used re.match and was prone to pathological back-tracking. + if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return '' clean = [] for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): @@ -2739,7 +2795,8 @@ _iso8601_re = [ 'OOO', r'(?P[0123]\d\d)').replace( 'CC', r'(?P\d\d$)') + r'(T?(?P\d{2}):(?P\d{2})' - + r'(:(?P\d{2}(\.\d*)?))?' + + r'(:(?P\d{2}))?' + + r'(\.(?P\d+))?' + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl] del tmpl @@ -3370,7 +3427,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer except Exception, e: result['bozo'] = 1 result['bozo_exception'] = e - data = '' + data = None f = None # if feed is gzip-compressed, decompress it @@ -3428,8 +3485,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer bozo_message = 'no Content-type specified' result['bozo'] = 1 result['bozo_exception'] = NonXMLContentType(bozo_message) - - result['version'], data, entities = _stripDoctype(data) + + if data is not None: + result['version'], data, entities = _stripDoctype(data) baseuri = http_headers.get('content-location', result.get('href')) baselang = http_headers.get('content-language', None) @@ -3442,7 +3500,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer return result # if there was a problem downloading, we're done - if not data: + if data is None: return result # determine character encoding diff --git a/r2e b/r2e old mode 100644 new mode 100755 diff --git a/rss2email.py b/rss2email.py old mode 100644 new mode 100755 index fff46ac..00fdbd7 --- a/rss2email.py +++ b/rss2email.py @@ -11,13 +11,13 @@ Usage: reset delete n """ -__version__ = "2.66" +__version__ = "2.67" __author__ = "Lindsey Smith (lindsey@allthingsrss.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3." ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", "Matej Cepl", "Martin 'Joey' Schulze", "Marcel Ackermann (http://www.DreamFlasher.de)", - "Lindsey Smith", "Aaron Swartz (original author)" ] + "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ] import urllib2 urllib2.install_opener(urllib2.build_opener()) @@ -153,13 +153,13 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps fromhdr = formataddr((sender_name, sender_addr)) msg['From'] = fromhdr - + msg_as_string = msg.as_string() #DEPRECATED if QP_REQUIRED: #DEPRECATED ins, outs = SIO(msg_as_string), SIO() #DEPRECATED mimify.mimify(ins, outs) #DEPRECATED msg_as_string = outs.getvalue() - + if SMTP_SEND: if not smtpserver: import smtplib @@ -283,6 +283,8 @@ h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH h2t.BODY_WIDTH = BODY_WIDTH html2text = h2t.html2text +from types import * + ### Utility Functions ### import threading @@ -371,7 +373,12 @@ def getContent(entry, HTMLOK=0): def getID(entry): """Get best ID from an entry.""" if TRUST_GUID: - if 'id' in entry and entry.id: return entry.id + if 'id' in entry and entry.id: + # Newer versions of feedparser could return a dictionary + if type(entry.id) is DictType: + return entry.id.values()[0] + + return entry.id content = getContent(entry) if content and content != "\n": return hash(unu(content)).hexdigest() @@ -589,13 +596,17 @@ def run(num=None): # Instead of letting these run wild, we put them in context # by associating them with the actual ID (if it exists). - frameid = entry.get('id', id) + frameid = entry.get('id') + if not(frameid): frameid = id + if type(frameid) is DictType: + frameid = frameid.values()[0] # If this item's ID is in our database # then it's already been sent # and we don't need to do anything more. - if f.seen.has_key(frameid) and f.seen[frameid] == id: continue + if frameid in f.seen: + if f.seen[frameid] == id: continue if not (f.to or default_to): print "No default email address defined. Please run 'r2e email emailaddress'" @@ -622,8 +633,8 @@ def run(num=None): from_addr = getEmail(r.feed, entry) - name = getName(r, entry) - fromhdr = '"'+ name + '" <' + from_addr + ">" + name = h2t.unescape(getName(r, entry)) + fromhdr = formataddr((name, from_addr,)) tohdr = (f.to or default_to) subjecthdr = title datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) @@ -648,7 +659,7 @@ def run(num=None): content += '
\n' content += ''+subjecthdr+'\n\n' + content += '>'+subjecthdr+'\n' if ishtml(entrycontent): body = entrycontent[1].strip() else: @@ -662,6 +673,12 @@ def run(num=None): content += ('
Enclosure: '+enclosure.url+"\n") if (hasattr(enclosure, 'src') and enclosure.src != ""): content += ('
Enclosure: '+enclosure.src+'
'+extralink['title']+'\n' content += '

\n' content += "\n\n" else: @@ -677,7 +694,11 @@ def run(num=None): for enclosure in entry.enclosures: if enclosure.url != "": content += ('Enclosure: '+enclosure.url+"
\n") - + if 'links' in entry: + for extralink in entry.links: + if ('rel' in extralink) and extralink['rel'] == u'via': + content += 'Via: '+extralink['title']+'
\n' + content += ("\n") else: content = entrycontent.strip() + "\n\nURL: "+link @@ -685,6 +706,10 @@ def run(num=None): for enclosure in entry.enclosures: if enclosure.url != "": content += ('\nEnclosure: ' + enclosure.url + "\n") + if 'links' in entry: + for extralink in entry.links: + if ('rel' in extralink) and extralink['rel'] == u'via': + content += 'Via: '+extralink['title']+'\n' smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver) -- 2.26.2