From eb04d97b99d96d12b5b21447b4b1cac286ed49d1 Mon Sep 17 00:00:00 2001
From: Lindsey Smith <lindsey@allthingsrss.com>
Date: Tue, 21 Sep 2010 12:00:00 +0000
Subject: [PATCH] Bump to version 2.67.

---
 CHANGELOG     |   7 ++++
 config.py     |   0
 feedparser.py | 100 +++++++++++++++++++++++++++++++++++++++-----------
 r2e           |   0
 rss2email.py  |  47 ++++++++++++++++++------
 5 files changed, 122 insertions(+), 32 deletions(-)
 mode change 100644 => 100755 config.py
 mode change 100644 => 100755 r2e
 mode change 100644 => 100755 rss2email.py

diff --git a/CHANGELOG b/CHANGELOG
index 171f7d5..d9cf7ba 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+v2.67 (2010-09-21)
+    * Fixed entries with a blank id (i.e., an empty string) being resent
+    * Fixed some entries not being sent by email because they had bad From headers	
+    * Fixed From headers with HTML entities encoded twice
+    * Compatibility changes to support most recent development versions of feedparser
+    * Compatibility changes to support Google Reader feeds
+	
 v2.66 (2009-12-21)
 
     * Complete packaging of all necessary source files (rss2email, html2text, feedparser, r2e, etc.) into one bundle
diff --git a/config.py b/config.py
old mode 100644
new mode 100755
diff --git a/feedparser.py b/feedparser.py
index 3cfde1b..0760cc2 100644
--- a/feedparser.py
+++ b/feedparser.py
@@ -40,7 +40,8 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                     "Aaron Swartz <http://aaronsw.com/>",
                     "Kevin Marks <http://epeus.blogspot.com/>",
-                    "Sam Ruby <http://intertwingly.net/>"]
+                    "Sam Ruby <http://intertwingly.net/>",
+                    "Ade Oshineye <http://blog.oshineye.com/>"]
 _debug = 0
 
 # HTTP "User-Agent" header to send to servers when downloading feeds.
@@ -407,6 +408,8 @@ class _FeedParserMixin:
                   'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                   'http://purl.org/rss/1.0/modules/link/':                'l',
                   'http://search.yahoo.com/mrss':                         'media',
+                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+                  'http://search.yahoo.com/mrss/':                         'media',
                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                   'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
@@ -547,7 +550,15 @@ class _FeedParserMixin:
             method = getattr(self, methodname)
             return method(attrsD)
         except AttributeError:
-            return self.push(prefix + suffix, 1)
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD
 
     def unknown_endtag(self, tag):
         if _debug: sys.stderr.write('end %s\n' % tag)
@@ -643,12 +654,19 @@ class _FeedParserMixin:
         if _debug: sys.stderr.write('entering parse_declaration\n')
         if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
             return k+3
         else:
             k = self.rawdata.find('>', i)
-            return k+1
+            if k >= 0:
+                return k+1
+            else:
+                # Incomplete declaration: no closing '>' was found, so k is -1.
+                return k
 
     def mapContentType(self, contentType):
         contentType = contentType.lower()
@@ -919,7 +937,10 @@ class _FeedParserMixin:
                       '0.92': 'rss092',
                       '0.93': 'rss093',
                       '0.94': 'rss094'}
-        if not self.version:
+        #If we're here then this is an RSS feed.
+        #If we don't have a version or have a version that starts with something
+        #other than RSS then there's been a mistake. Correct it.
+        if not self.version or not self.version.startswith('rss'):
             attr_version = attrsD.get('version', '')
             version = versionmap.get(attr_version)
             if version:
@@ -1110,7 +1131,7 @@ class _FeedParserMixin:
     def _getContext(self):
         if self.insource:
             context = self.sourcedata
-        elif self.inimage:
+        elif self.inimage and self.feeddata.has_key('image'):
             context = self.feeddata['image']
         elif self.intextinput:
             context = self.feeddata['textinput']
@@ -1481,11 +1502,18 @@ class _FeedParserMixin:
             context['id'] = href
             
     def _start_source(self, attrsD):
+        if 'url' in attrsD:
+          # This means that we're processing a source element from an RSS 2.0 feed
+          self.sourcedata['href'] = attrsD[u'url']
+        self.push('source', 1)
         self.insource = 1
         self.hasTitle = 0
 
     def _end_source(self):
         self.insource = 0
+        value = self.pop('source')
+        if value:
+          self.sourcedata['title'] = value
         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
         self.sourcedata.clear()
 
@@ -1550,6 +1578,15 @@ class _FeedParserMixin:
             if not context['media_thumbnail'][-1].has_key('url'):
                 context['media_thumbnail'][-1]['url'] = url
 
+    def _start_media_player(self, attrsD):
+        self.push('media_player', 0)
+        self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+    def _end_media_player(self):
+        value = self.pop('media_player')
+        context = self._getContext()
+        context['media_player']['content'] = value
+
 if _XML_AVAILABLE:
     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
         def __init__(self, baseuri, baselang, encoding):
@@ -1558,9 +1595,12 @@ if _XML_AVAILABLE:
             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
             self.bozo = 0
             self.exc = None
+            self.decls = {}
         
         def startPrefixMapping(self, prefix, uri):
             self.trackNamespace(prefix, uri)
+            if uri == 'http://www.w3.org/1999/xlink':
+              self.decls['xmlns:'+prefix] = uri
         
         def startElementNS(self, name, qname, attrs):
             namespace, localname = name
@@ -1585,7 +1625,7 @@ if _XML_AVAILABLE:
             # the qnames the SAX parser gives us (if indeed it gives us any
             # at all).  Thanks to MatejC for helping me test this and
             # tirelessly telling me that it didn't work yet.
-            attrsD = {}
+            attrsD, self.decls = self.decls, {}
             if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                 attrsD['xmlns']=namespace
             if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -1634,7 +1674,7 @@ if _XML_AVAILABLE:
         def error(self, exc):
             self.bozo = 1
             self.exc = exc
-            
+
         def fatalError(self, exc):
             self.error(exc)
             raise exc
@@ -1642,15 +1682,18 @@ if _XML_AVAILABLE:
 class _BaseHTMLProcessor(sgmllib.SGMLParser):
     special = re.compile('''[<>'"]''')
     bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-    
+    elements_no_end_tag = [
+      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 
+      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+      'source', 'track', 'wbr'
+    ]
+
     def __init__(self, encoding, type):
         self.encoding = encoding
         self.type = type
         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
         sgmllib.SGMLParser.__init__(self)
-        
+
     def reset(self):
         self.pieces = []
         sgmllib.SGMLParser.reset(self)
@@ -1748,7 +1791,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
         self.pieces.append(text)
         
     def handle_comment(self, text):
@@ -2275,12 +2318,16 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         return _urljoin(self.baseuri, uri.strip())
     
     def unknown_starttag(self, tag, attrs):
+        if _debug:
+            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
         attrs = self.normalize_attrs(attrs)
         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
 def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+    if _debug:
+        sys.stderr.write('entering _resolveRelativeURIs\n')
+
     p = _RelativeURIResolver(baseURI, encoding, type)
     p.feed(htmlSource)
     return p.output()
@@ -2420,6 +2467,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
             if tag in self.unacceptable_elements_with_end_tag:
                 self.unacceptablestack += 1
 
+            # add implicit namespaces to html5 inline svg/mathml
+            if self.type.endswith('html'):
+                if not dict(attrs).get('xmlns'):
+                    if tag=='svg':
+                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+                    if tag=='math':
+                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
             # not otherwise acceptable, perhaps it is MathML or SVG?
             if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                 self.mathmlOK += 1
@@ -2493,7 +2548,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
 
         # gauntlet
         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
-        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
 
         clean = []
         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
@@ -2739,7 +2795,8 @@ _iso8601_re = [
     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
     'CC', r'(?P<century>\d\d$)')
     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}(\.\d*)?))?'
+    + r'(:(?P<second>\d{2}))?'
+    + r'(\.(?P<fracsecond>\d+))?'
     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
     for tmpl in _iso8601_tmpl]
 del tmpl
@@ -3370,7 +3427,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     except Exception, e:
         result['bozo'] = 1
         result['bozo_exception'] = e
-        data = ''
+        data = None
         f = None
 
     # if feed is gzip-compressed, decompress it
@@ -3428,8 +3485,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
             bozo_message = 'no Content-type specified'
         result['bozo'] = 1
         result['bozo_exception'] = NonXMLContentType(bozo_message)
-        
-    result['version'], data, entities = _stripDoctype(data)
+
+    if data is not None:
+        result['version'], data, entities = _stripDoctype(data)
 
     baseuri = http_headers.get('content-location', result.get('href'))
     baselang = http_headers.get('content-language', None)
@@ -3442,7 +3500,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         return result
 
     # if there was a problem downloading, we're done
-    if not data:
+    if data is None:
         return result
 
     # determine character encoding
diff --git a/r2e b/r2e
old mode 100644
new mode 100755
diff --git a/rss2email.py b/rss2email.py
old mode 100644
new mode 100755
index fff46ac..00fdbd7
--- a/rss2email.py
+++ b/rss2email.py
@@ -11,13 +11,13 @@ Usage:
   reset
   delete n
 """
-__version__ = "2.66"
+__version__ = "2.67"
 __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", 
                      "Matej Cepl", "Martin 'Joey' Schulze", 
                      "Marcel Ackermann (http://www.DreamFlasher.de)", 
-                     "Lindsey Smith", "Aaron Swartz (original author)" ]
+                     "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
 
 import urllib2
 urllib2.install_opener(urllib2.build_opener())
@@ -153,13 +153,13 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps
 		
 	fromhdr = formataddr((sender_name, sender_addr))
 	msg['From'] = fromhdr
-		
+
 	msg_as_string = msg.as_string()
 #DEPRECATED 	if QP_REQUIRED:
 #DEPRECATED 		ins, outs = SIO(msg_as_string), SIO()
 #DEPRECATED 		mimify.mimify(ins, outs)
 #DEPRECATED 		msg_as_string = outs.getvalue()
-    		
+
 	if SMTP_SEND:
 		if not smtpserver: 
 			import smtplib
@@ -283,6 +283,8 @@ h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
 h2t.BODY_WIDTH = BODY_WIDTH
 html2text = h2t.html2text
 
+from types import *
+
 ### Utility Functions ###
 
 import threading
@@ -371,7 +373,12 @@ def getContent(entry, HTMLOK=0):
 def getID(entry):
 	"""Get best ID from an entry."""
 	if TRUST_GUID:
-		if 'id' in entry and entry.id: return entry.id
+		if 'id' in entry and entry.id: 
+			# Newer versions of feedparser could return a dictionary
+			if type(entry.id) is DictType:
+				return entry.id.values()[0]
+
+			return entry.id
 
 	content = getContent(entry)
 	if content and content != "\n": return hash(unu(content)).hexdigest()
@@ -589,13 +596,17 @@ def run(num=None):
 					# Instead of letting these run wild, we put them in context
 					# by associating them with the actual ID (if it exists).
 					
-					frameid = entry.get('id', id)
+					frameid = entry.get('id')
+					if not(frameid): frameid = id
+					if type(frameid) is DictType:
+						frameid = frameid.values()[0]
 					
 					# If this item's ID is in our database
 					# then it's already been sent
 					# and we don't need to do anything more.
 					
-					if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
+					if frameid in f.seen:
+						if f.seen[frameid] == id: continue
 
 					if not (f.to or default_to):
 						print "No default email address defined. Please run 'r2e email emailaddress'"
@@ -622,8 +633,8 @@ def run(num=None):
 					
 					from_addr = getEmail(r.feed, entry)
 					
-					name = getName(r, entry)
-					fromhdr = '"'+ name + '" <' + from_addr + ">"
+                                        name = h2t.unescape(getName(r, entry))
+					fromhdr = formataddr((name, from_addr,))
 					tohdr = (f.to or default_to)
 					subjecthdr = title
 					datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
@@ -648,7 +659,7 @@ def run(num=None):
 						content += '<div id="entry">\n'
 						content += '<h1'
 						content += ' class="header"'
-						content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n'
+						content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
 						if ishtml(entrycontent):
 							body = entrycontent[1].strip()
 						else:
@@ -662,6 +673,12 @@ def run(num=None):
 									content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
 								if (hasattr(enclosure, 'src') and enclosure.src != ""):
 									content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
+						if 'links' in entry:
+							for extralink in entry.links:
+								if ('rel' in extralink) and extralink['rel'] == u'via':
+									extraurl = extralink['href']
+									extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
+									content += '<br/>Via: <a href="'+extraurl+'">'+extralink['title']+'</a>\n'
 						content += '</p></div>\n'
 						content += "\n\n</body></html>"
 					else:	
@@ -677,7 +694,11 @@ def run(num=None):
 								for enclosure in entry.enclosures:
 									if enclosure.url != "":
 										content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
-							
+							if 'links' in entry:
+								for extralink in entry.links:
+									if ('rel' in extralink) and extralink['rel'] == u'via':
+										content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
+                                                                
 							content += ("\n</body></html>")
 						else:
 							content = entrycontent.strip() + "\n\nURL: "+link
@@ -685,6 +706,10 @@ def run(num=None):
 								for enclosure in entry.enclosures:
 									if enclosure.url != "":
 										content += ('\nEnclosure: ' + enclosure.url + "\n")
+							if 'links' in entry:
+								for extralink in entry.links:
+									if ('rel' in extralink) and extralink['rel'] == u'via':
+										content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
 
 					smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
 			
-- 
2.26.2