Bump to version 2.67. v2.67
author Lindsey Smith <lindsey@allthingsrss.com>
Tue, 21 Sep 2010 12:00:00 +0000 (12:00 +0000)
committer W. Trevor King <wking@tremily.us>
Thu, 4 Oct 2012 11:05:45 +0000 (07:05 -0400)
CHANGELOG
config.py [changed mode: 0644->0755]
feedparser.py
r2e [changed mode: 0644->0755]
rss2email.py [changed mode: 0644->0755]

index 171f7d5d3c4323467d6d922d64e843c29dd3a043..d9cf7ba67e4bb25b7fe40e78322e360baa062669 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+v2.67 (2010-09-21)
+    * Fixed entries whose id is blank (i.e., an empty string) being resent
+    * Fixed some entries not being sent by email because they had bad From headers     
+    * Fixed From headers with HTML entities encoded twice
+    * Compatibility changes to support most recent development versions of feedparser
+    * Compatibility changes to support Google Reader feeds
+       
 v2.66 (2009-12-21)
 
     * Complete packaging of all necessary source files (rss2email, html2text, feedparser, r2e, etc.) into one bundle
old mode 100644 (file)
new mode 100755 (executable)
index 3cfde1be936b1cfa20bda7f043c4a99dc65778cb..0760cc2ca1243f3160627c70b51f4fcff56b8d55 100644 (file)
@@ -40,7 +40,8 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                     "Aaron Swartz <http://aaronsw.com/>",
                     "Kevin Marks <http://epeus.blogspot.com/>",
-                    "Sam Ruby <http://intertwingly.net/>"]
+                    "Sam Ruby <http://intertwingly.net/>",
+                    "Ade Oshineye <http://blog.oshineye.com/>"]
 _debug = 0
 
 # HTTP "User-Agent" header to send to servers when downloading feeds.
@@ -407,6 +408,8 @@ class _FeedParserMixin:
                   'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                   'http://purl.org/rss/1.0/modules/link/':                'l',
                   'http://search.yahoo.com/mrss':                         'media',
+                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+                  'http://search.yahoo.com/mrss/':                         'media',
                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                   'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
@@ -547,7 +550,15 @@ class _FeedParserMixin:
             method = getattr(self, methodname)
             return method(attrsD)
         except AttributeError:
-            return self.push(prefix + suffix, 1)
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD
 
     def unknown_endtag(self, tag):
         if _debug: sys.stderr.write('end %s\n' % tag)
@@ -643,12 +654,19 @@ class _FeedParserMixin:
         if _debug: sys.stderr.write('entering parse_declaration\n')
         if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
             return k+3
         else:
             k = self.rawdata.find('>', i)
-            return k+1
+            if k >= 0:
+                return k+1
+            else:
+                # Incomplete declaration: no closing '>' was found, so return -1.
+                return k
 
     def mapContentType(self, contentType):
         contentType = contentType.lower()
@@ -919,7 +937,10 @@ class _FeedParserMixin:
                       '0.92': 'rss092',
                       '0.93': 'rss093',
                       '0.94': 'rss094'}
-        if not self.version:
+        #If we're here then this is an RSS feed.
+        #If we don't have a version or have a version that starts with something
+        #other than RSS then there's been a mistake. Correct it.
+        if not self.version or not self.version.startswith('rss'):
             attr_version = attrsD.get('version', '')
             version = versionmap.get(attr_version)
             if version:
@@ -1110,7 +1131,7 @@ class _FeedParserMixin:
     def _getContext(self):
         if self.insource:
             context = self.sourcedata
-        elif self.inimage:
+        elif self.inimage and self.feeddata.has_key('image'):
             context = self.feeddata['image']
         elif self.intextinput:
             context = self.feeddata['textinput']
@@ -1481,11 +1502,18 @@ class _FeedParserMixin:
             context['id'] = href
             
     def _start_source(self, attrsD):
+        if 'url' in attrsD:
+          # This means that we're processing a source element from an RSS 2.0 feed
+          self.sourcedata['href'] = attrsD[u'url']
+        self.push('source', 1)
         self.insource = 1
         self.hasTitle = 0
 
     def _end_source(self):
         self.insource = 0
+        value = self.pop('source')
+        if value:
+          self.sourcedata['title'] = value
         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
         self.sourcedata.clear()
 
@@ -1550,6 +1578,15 @@ class _FeedParserMixin:
             if not context['media_thumbnail'][-1].has_key('url'):
                 context['media_thumbnail'][-1]['url'] = url
 
+    def _start_media_player(self, attrsD):
+        self.push('media_player', 0)
+        self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+    def _end_media_player(self):
+        value = self.pop('media_player')
+        context = self._getContext()
+        context['media_player']['content'] = value
+
 if _XML_AVAILABLE:
     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
         def __init__(self, baseuri, baselang, encoding):
@@ -1558,9 +1595,12 @@ if _XML_AVAILABLE:
             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
             self.bozo = 0
             self.exc = None
+            self.decls = {}
         
         def startPrefixMapping(self, prefix, uri):
             self.trackNamespace(prefix, uri)
+            if uri == 'http://www.w3.org/1999/xlink':
+              self.decls['xmlns:'+prefix] = uri
         
         def startElementNS(self, name, qname, attrs):
             namespace, localname = name
@@ -1585,7 +1625,7 @@ if _XML_AVAILABLE:
             # the qnames the SAX parser gives us (if indeed it gives us any
             # at all).  Thanks to MatejC for helping me test this and
             # tirelessly telling me that it didn't work yet.
-            attrsD = {}
+            attrsD, self.decls = self.decls, {}
             if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                 attrsD['xmlns']=namespace
             if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -1634,7 +1674,7 @@ if _XML_AVAILABLE:
         def error(self, exc):
             self.bozo = 1
             self.exc = exc
-            
+
         def fatalError(self, exc):
             self.error(exc)
             raise exc
@@ -1642,15 +1682,18 @@ if _XML_AVAILABLE:
 class _BaseHTMLProcessor(sgmllib.SGMLParser):
     special = re.compile('''[<>'"]''')
     bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-    
+    elements_no_end_tag = [
+      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 
+      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+      'source', 'track', 'wbr'
+    ]
+
     def __init__(self, encoding, type):
         self.encoding = encoding
         self.type = type
         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
         sgmllib.SGMLParser.__init__(self)
-        
+
     def reset(self):
         self.pieces = []
         sgmllib.SGMLParser.reset(self)
@@ -1748,7 +1791,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
         self.pieces.append(text)
         
     def handle_comment(self, text):
@@ -2275,12 +2318,16 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         return _urljoin(self.baseuri, uri.strip())
     
     def unknown_starttag(self, tag, attrs):
+        if _debug:
+            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
         attrs = self.normalize_attrs(attrs)
         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
 def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+    if _debug:
+        sys.stderr.write('entering _resolveRelativeURIs\n')
+
     p = _RelativeURIResolver(baseURI, encoding, type)
     p.feed(htmlSource)
     return p.output()
@@ -2420,6 +2467,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
             if tag in self.unacceptable_elements_with_end_tag:
                 self.unacceptablestack += 1
 
+            # add implicit namespaces to html5 inline svg/mathml
+            if self.type.endswith('html'):
+                if not dict(attrs).get('xmlns'):
+                    if tag=='svg':
+                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+                    if tag=='math':
+                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
             # not otherwise acceptable, perhaps it is MathML or SVG?
             if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                 self.mathmlOK += 1
@@ -2493,7 +2548,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
 
         # gauntlet
         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
-        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
 
         clean = []
         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
@@ -2739,7 +2795,8 @@ _iso8601_re = [
     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
     'CC', r'(?P<century>\d\d$)')
     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}(\.\d*)?))?'
+    + r'(:(?P<second>\d{2}))?'
+    + r'(\.(?P<fracsecond>\d+))?'
     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
     for tmpl in _iso8601_tmpl]
 del tmpl
@@ -3370,7 +3427,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     except Exception, e:
         result['bozo'] = 1
         result['bozo_exception'] = e
-        data = ''
+        data = None
         f = None
 
     # if feed is gzip-compressed, decompress it
@@ -3428,8 +3485,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
             bozo_message = 'no Content-type specified'
         result['bozo'] = 1
         result['bozo_exception'] = NonXMLContentType(bozo_message)
-        
-    result['version'], data, entities = _stripDoctype(data)
+
+    if data is not None:
+        result['version'], data, entities = _stripDoctype(data)
 
     baseuri = http_headers.get('content-location', result.get('href'))
     baselang = http_headers.get('content-language', None)
@@ -3442,7 +3500,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         return result
 
     # if there was a problem downloading, we're done
-    if not data:
+    if data is None:
         return result
 
     # determine character encoding
diff --git a/r2e b/r2e
old mode 100644 (file)
new mode 100755 (executable)
old mode 100644 (file)
new mode 100755 (executable)
index fff46ac..00fdbd7
@@ -11,13 +11,13 @@ Usage:
   reset
   delete n
 """
-__version__ = "2.66"
+__version__ = "2.67"
 __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", 
                      "Matej Cepl", "Martin 'Joey' Schulze", 
                      "Marcel Ackermann (http://www.DreamFlasher.de)", 
-                     "Lindsey Smith", "Aaron Swartz (original author)" ]
+                     "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
 
 import urllib2
 urllib2.install_opener(urllib2.build_opener())
@@ -153,13 +153,13 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps
                
        fromhdr = formataddr((sender_name, sender_addr))
        msg['From'] = fromhdr
-               
+
        msg_as_string = msg.as_string()
 #DEPRECATED    if QP_REQUIRED:
 #DEPRECATED            ins, outs = SIO(msg_as_string), SIO()
 #DEPRECATED            mimify.mimify(ins, outs)
 #DEPRECATED            msg_as_string = outs.getvalue()
-               
+
        if SMTP_SEND:
                if not smtpserver: 
                        import smtplib
@@ -283,6 +283,8 @@ h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
 h2t.BODY_WIDTH = BODY_WIDTH
 html2text = h2t.html2text
 
+from types import *
+
 ### Utility Functions ###
 
 import threading
@@ -371,7 +373,12 @@ def getContent(entry, HTMLOK=0):
 def getID(entry):
        """Get best ID from an entry."""
        if TRUST_GUID:
-               if 'id' in entry and entry.id: return entry.id
+               if 'id' in entry and entry.id: 
+                       # Newer versions of feedparser could return a dictionary
+                       if type(entry.id) is DictType:
+                               return entry.id.values()[0]
+
+                       return entry.id
 
        content = getContent(entry)
        if content and content != "\n": return hash(unu(content)).hexdigest()
@@ -589,13 +596,17 @@ def run(num=None):
                                        # Instead of letting these run wild, we put them in context
                                        # by associating them with the actual ID (if it exists).
                                        
-                                       frameid = entry.get('id', id)
+                                       frameid = entry.get('id')
+                                       if not(frameid): frameid = id
+                                       if type(frameid) is DictType:
+                                               frameid = frameid.values()[0]
                                        
                                        # If this item's ID is in our database
                                        # then it's already been sent
                                        # and we don't need to do anything more.
                                        
-                                       if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
+                                       if frameid in f.seen:
+                                               if f.seen[frameid] == id: continue
 
                                        if not (f.to or default_to):
                                                print "No default email address defined. Please run 'r2e email emailaddress'"
@@ -622,8 +633,8 @@ def run(num=None):
                                        
                                        from_addr = getEmail(r.feed, entry)
                                        
-                                       name = getName(r, entry)
-                                       fromhdr = '"'+ name + '" <' + from_addr + ">"
+                                        name = h2t.unescape(getName(r, entry))
+                                       fromhdr = formataddr((name, from_addr,))
                                        tohdr = (f.to or default_to)
                                        subjecthdr = title
                                        datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
@@ -648,7 +659,7 @@ def run(num=None):
                                                content += '<div id="entry">\n'
                                                content += '<h1'
                                                content += ' class="header"'
-                                               content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n'
+                                               content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                                                if ishtml(entrycontent):
                                                        body = entrycontent[1].strip()
                                                else:
@@ -662,6 +673,12 @@ def run(num=None):
                                                                        content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                                                        content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
+                                               if 'links' in entry:
+                                                       for extralink in entry.links:
+                                                               if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                       extraurl = extralink['href']
+                                                                       extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
+                                                                       content += '<br/>Via: <a href="'+extraurl+'">'+extralink['title']+'</a>\n'
                                                content += '</p></div>\n'
                                                content += "\n\n</body></html>"
                                        else:   
@@ -677,7 +694,11 @@ def run(num=None):
                                                                for enclosure in entry.enclosures:
                                                                        if enclosure.url != "":
                                                                                content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
-                                                       
+                                                       if 'links' in entry:
+                                                               for extralink in entry.links:
+                                                                       if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                               content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
+                                                                
                                                        content += ("\n</body></html>")
                                                else:
                                                        content = entrycontent.strip() + "\n\nURL: "+link
@@ -685,6 +706,10 @@ def run(num=None):
                                                                for enclosure in entry.enclosures:
                                                                        if enclosure.url != "":
                                                                                content += ('\nEnclosure: ' + enclosure.url + "\n")
+                                                       if 'links' in entry:
+                                                               for extralink in entry.links:
+                                                                       if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                               content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
 
                                        smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)