Bump to version 2.67.

author Lindsey Smith <lindsey@allthingsrss.com>

Tue, 21 Sep 2010 12:00:00 +0000 (12:00 +0000)

committer W. Trevor King <wking@tremily.us>

Thu, 4 Oct 2012 11:05:45 +0000 (07:05 -0400)
author Lindsey Smith <lindsey@allthingsrss.com>
Tue, 21 Sep 2010 12:00:00 +0000 (12:00 +0000)
committer W. Trevor King <wking@tremily.us>
Thu, 4 Oct 2012 11:05:45 +0000 (07:05 -0400)
diff --git a/CHANGELOG b/CHANGELOG

index 171f7d5d3c4323467d6d922d64e843c29dd3a043..d9cf7ba67e4bb25b7fe40e78322e360baa062669 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+v2.67 (2010-09-21)
+    * Fixed entries that include a blank id (i.e., an empty string) being resent
+    * Fixed some entries not being sent by email because they had bad From headers     
+    * Fixed From headers with HTML entities encoded twice
+    * Compatibility changes to support most recent development versions of feedparser
+    * Compatibility changes to support Google Reader feeds
+       
  v2.66 (2009-12-21)
  
      * Complete packaging of all necessary source files (rss2email, html2text, feedparser, r2e, etc.) into one bundle
diff --git a/config.py b/config.py

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/feedparser.py b/feedparser.py

index 3cfde1be936b1cfa20bda7f043c4a99dc65778cb..0760cc2ca1243f3160627c70b51f4fcff56b8d55 100644 (file)
--- a/feedparser.py
+++ b/feedparser.py
@@ -40,7 +40,8 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                      "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                      "Aaron Swartz <http://aaronsw.com/>",
                      "Kevin Marks <http://epeus.blogspot.com/>",
-                    "Sam Ruby <http://intertwingly.net/>"]
+                    "Sam Ruby <http://intertwingly.net/>",
+                    "Ade Oshineye <http://blog.oshineye.com/>"]
  _debug = 0
  
  # HTTP "User-Agent" header to send to servers when downloading feeds.
@@ -407,6 +408,8 @@ class _FeedParserMixin:
                    'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                    'http://purl.org/rss/1.0/modules/link/':                'l',
                    'http://search.yahoo.com/mrss':                         'media',
+                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+                  'http://search.yahoo.com/mrss/':                         'media',
                    'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                    'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                    'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
@@ -547,7 +550,15 @@ class _FeedParserMixin:
              method = getattr(self, methodname)
              return method(attrsD)
          except AttributeError:
-            return self.push(prefix + suffix, 1)
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD
  
      def unknown_endtag(self, tag):
          if _debug: sys.stderr.write('end %s\n' % tag)
@@ -643,12 +654,19 @@ class _FeedParserMixin:
          if _debug: sys.stderr.write('entering parse_declaration\n')
          if self.rawdata[i:i+9] == '<![CDATA[':
              k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
              self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
              return k+3
          else:
              k = self.rawdata.find('>', i)
-            return k+1
+            if k >= 0:
+                return k+1
+            else:
+                # We have an incomplete CDATA block.
+                return k
  
      def mapContentType(self, contentType):
          contentType = contentType.lower()
@@ -919,7 +937,10 @@ class _FeedParserMixin:
                        '0.92': 'rss092',
                        '0.93': 'rss093',
                        '0.94': 'rss094'}
-        if not self.version:
+        #If we're here then this is an RSS feed.
+        #If we don't have a version or have a version that starts with something
+        #other than RSS then there's been a mistake. Correct it.
+        if not self.version or not self.version.startswith('rss'):
              attr_version = attrsD.get('version', '')
              version = versionmap.get(attr_version)
              if version:
@@ -1110,7 +1131,7 @@ class _FeedParserMixin:
      def _getContext(self):
          if self.insource:
              context = self.sourcedata
-        elif self.inimage:
+        elif self.inimage and self.feeddata.has_key('image'):
              context = self.feeddata['image']
          elif self.intextinput:
              context = self.feeddata['textinput']
@@ -1481,11 +1502,18 @@ class _FeedParserMixin:
              context['id'] = href
              
      def _start_source(self, attrsD):
+        if 'url' in attrsD:
+          # This means that we're processing a source element from an RSS 2.0 feed
+          self.sourcedata['href'] = attrsD[u'url']
+        self.push('source', 1)
          self.insource = 1
          self.hasTitle = 0
  
      def _end_source(self):
          self.insource = 0
+        value = self.pop('source')
+        if value:
+          self.sourcedata['title'] = value
          self._getContext()['source'] = copy.deepcopy(self.sourcedata)
          self.sourcedata.clear()
  
@@ -1550,6 +1578,15 @@ class _FeedParserMixin:
              if not context['media_thumbnail'][-1].has_key('url'):
                  context['media_thumbnail'][-1]['url'] = url
  
+    def _start_media_player(self, attrsD):
+        self.push('media_player', 0)
+        self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+    def _end_media_player(self):
+        value = self.pop('media_player')
+        context = self._getContext()
+        context['media_player']['content'] = value
+
  if _XML_AVAILABLE:
      class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
          def __init__(self, baseuri, baselang, encoding):
@@ -1558,9 +1595,12 @@ if _XML_AVAILABLE:
              _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
              self.bozo = 0
              self.exc = None
+            self.decls = {}
          
          def startPrefixMapping(self, prefix, uri):
              self.trackNamespace(prefix, uri)
+            if uri == 'http://www.w3.org/1999/xlink':
+              self.decls['xmlns:'+prefix] = uri
          
          def startElementNS(self, name, qname, attrs):
              namespace, localname = name
@@ -1585,7 +1625,7 @@ if _XML_AVAILABLE:
              # the qnames the SAX parser gives us (if indeed it gives us any
              # at all).  Thanks to MatejC for helping me test this and
              # tirelessly telling me that it didn't work yet.
-            attrsD = {}
+            attrsD, self.decls = self.decls, {}
              if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                  attrsD['xmlns']=namespace
              if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -1634,7 +1674,7 @@ if _XML_AVAILABLE:
          def error(self, exc):
              self.bozo = 1
              self.exc = exc
-            
+
          def fatalError(self, exc):
              self.error(exc)
              raise exc
@@ -1642,15 +1682,18 @@ if _XML_AVAILABLE:
  class _BaseHTMLProcessor(sgmllib.SGMLParser):
      special = re.compile('''[<>'"]''')
      bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-    
+    elements_no_end_tag = [
+      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 
+      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+      'source', 'track', 'wbr'
+    ]
+
      def __init__(self, encoding, type):
          self.encoding = encoding
          self.type = type
          if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
          sgmllib.SGMLParser.__init__(self)
-        
+
      def reset(self):
          self.pieces = []
          sgmllib.SGMLParser.reset(self)
@@ -1748,7 +1791,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
          # called for each block of plain text, i.e. outside of any tag and
          # not containing any character or entity references
          # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
          self.pieces.append(text)
          
      def handle_comment(self, text):
@@ -2275,12 +2318,16 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
          return _urljoin(self.baseuri, uri.strip())
      
      def unknown_starttag(self, tag, attrs):
+        if _debug:
+            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
          attrs = self.normalize_attrs(attrs)
          attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
          _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
  def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+    if _debug:
+        sys.stderr.write('entering _resolveRelativeURIs\n')
+
      p = _RelativeURIResolver(baseURI, encoding, type)
      p.feed(htmlSource)
      return p.output()
@@ -2420,6 +2467,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
              if tag in self.unacceptable_elements_with_end_tag:
                  self.unacceptablestack += 1
  
+            # add implicit namespaces to html5 inline svg/mathml
+            if self.type.endswith('html'):
+                if not dict(attrs).get('xmlns'):
+                    if tag=='svg':
+                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+                    if tag=='math':
+                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
              # not otherwise acceptable, perhaps it is MathML or SVG?
              if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                  self.mathmlOK += 1
@@ -2493,7 +2548,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
  
          # gauntlet
          if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
-        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
  
          clean = []
          for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
@@ -2739,7 +2795,8 @@ _iso8601_re = [
      'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
      'CC', r'(?P<century>\d\d$)')
      + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}(\.\d*)?))?'
+    + r'(:(?P<second>\d{2}))?'
+    + r'(\.(?P<fracsecond>\d+))?'
      + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
      for tmpl in _iso8601_tmpl]
  del tmpl
@@ -3370,7 +3427,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
      except Exception, e:
          result['bozo'] = 1
          result['bozo_exception'] = e
-        data = ''
+        data = None
          f = None
  
      # if feed is gzip-compressed, decompress it
@@ -3428,8 +3485,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
              bozo_message = 'no Content-type specified'
          result['bozo'] = 1
          result['bozo_exception'] = NonXMLContentType(bozo_message)
-        
-    result['version'], data, entities = _stripDoctype(data)
+
+    if data is not None:
+        result['version'], data, entities = _stripDoctype(data)
  
      baseuri = http_headers.get('content-location', result.get('href'))
      baselang = http_headers.get('content-language', None)
@@ -3442,7 +3500,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
          return result
  
      # if there was a problem downloading, we're done
-    if not data:
+    if data is None:
          return result
  
      # determine character encoding
diff --git a/r2e b/r2e

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/rss2email.py b/rss2email.py

old mode 100644 (file)

new mode 100755 (executable)

index fff46ac..00fdbd7
--- a/rss2email.py
+++ b/rss2email.py
@@ -11,13 +11,13 @@ Usage:
    reset
    delete n
  """
-__version__ = "2.66"
+__version__ = "2.67"
  __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
  __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
  ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", 
                       "Matej Cepl", "Martin 'Joey' Schulze", 
                       "Marcel Ackermann (http://www.DreamFlasher.de)", 
-                     "Lindsey Smith", "Aaron Swartz (original author)" ]
+                     "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
  
  import urllib2
  urllib2.install_opener(urllib2.build_opener())
@@ -153,13 +153,13 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps
                 
         fromhdr = formataddr((sender_name, sender_addr))
         msg['From'] = fromhdr
-               
+
         msg_as_string = msg.as_string()
  #DEPRECATED    if QP_REQUIRED:
  #DEPRECATED            ins, outs = SIO(msg_as_string), SIO()
  #DEPRECATED            mimify.mimify(ins, outs)
  #DEPRECATED            msg_as_string = outs.getvalue()
-               
+
         if SMTP_SEND:
                 if not smtpserver: 
                         import smtplib
@@ -283,6 +283,8 @@ h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
  h2t.BODY_WIDTH = BODY_WIDTH
  html2text = h2t.html2text
  
+from types import *
+
  ### Utility Functions ###
  
  import threading
@@ -371,7 +373,12 @@ def getContent(entry, HTMLOK=0):
  def getID(entry):
         """Get best ID from an entry."""
         if TRUST_GUID:
-               if 'id' in entry and entry.id: return entry.id
+               if 'id' in entry and entry.id: 
+                       # Newer versions of feedparser could return a dictionary
+                       if type(entry.id) is DictType:
+                               return entry.id.values()[0]
+
+                       return entry.id
  
         content = getContent(entry)
         if content and content != "\n": return hash(unu(content)).hexdigest()
@@ -589,13 +596,17 @@ def run(num=None):
                                         # Instead of letting these run wild, we put them in context
                                         # by associating them with the actual ID (if it exists).
                                         
-                                       frameid = entry.get('id', id)
+                                       frameid = entry.get('id')
+                                       if not(frameid): frameid = id
+                                       if type(frameid) is DictType:
+                                               frameid = frameid.values()[0]
                                         
                                         # If this item's ID is in our database
                                         # then it's already been sent
                                         # and we don't need to do anything more.
                                         
-                                       if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
+                                       if frameid in f.seen:
+                                               if f.seen[frameid] == id: continue
  
                                         if not (f.to or default_to):
                                                 print "No default email address defined. Please run 'r2e email emailaddress'"
@@ -622,8 +633,8 @@ def run(num=None):
                                         
                                         from_addr = getEmail(r.feed, entry)
                                         
-                                       name = getName(r, entry)
-                                       fromhdr = '"'+ name + '" <' + from_addr + ">"
+                                        name = h2t.unescape(getName(r, entry))
+                                       fromhdr = formataddr((name, from_addr,))
                                         tohdr = (f.to or default_to)
                                         subjecthdr = title
                                         datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
@@ -648,7 +659,7 @@ def run(num=None):
                                                 content += '<div id="entry">\n'
                                                 content += '<h1'
                                                 content += ' class="header"'
-                                               content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n'
+                                               content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                                                 if ishtml(entrycontent):
                                                         body = entrycontent[1].strip()
                                                 else:
@@ -662,6 +673,12 @@ def run(num=None):
                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                                                 if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
+                                               if 'links' in entry:
+                                                       for extralink in entry.links:
+                                                               if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                       extraurl = extralink['href']
+                                                                       extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
+                                                                       content += '<br/>Via: <a href="'+extraurl+'">'+extralink['title']+'</a>\n'
                                                 content += '</p></div>\n'
                                                 content += "\n\n</body></html>"
                                         else:   
@@ -677,7 +694,11 @@ def run(num=None):
                                                                 for enclosure in entry.enclosures:
                                                                         if enclosure.url != "":
                                                                                 content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
-                                                       
+                                                       if 'links' in entry:
+                                                               for extralink in entry.links:
+                                                                       if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                               content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
+                                                                
                                                         content += ("\n</body></html>")
                                                 else:
                                                         content = entrycontent.strip() + "\n\nURL: "+link
@@ -685,6 +706,10 @@ def run(num=None):
                                                                 for enclosure in entry.enclosures:
                                                                         if enclosure.url != "":
                                                                                 content += ('\nEnclosure: ' + enclosure.url + "\n")
+                                                       if 'links' in entry:
+                                                               for extralink in entry.links:
+                                                                       if ('rel' in extralink) and extralink['rel'] == u'via':
+                                                                               content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
  
                                         smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
author	Lindsey Smith <lindsey@allthingsrss.com>
	Tue, 21 Sep 2010 12:00:00 +0000 (12:00 +0000)
committer	W. Trevor King <wking@tremily.us>
	Thu, 4 Oct 2012 11:05:45 +0000 (07:05 -0400)
CHANGELOG		patch \| blob \| history
config.py	[changed mode: 0644->0755]	patch \| blob \| history
feedparser.py		patch \| blob \| history
r2e	[changed mode: 0644->0755]	patch \| blob \| history
rss2email.py	[changed mode: 0644->0755]	patch \| blob \| history