"Fazal Majid <http://www.majid.info/mylos/weblog/>",
"Aaron Swartz <http://aaronsw.com/>",
"Kevin Marks <http://epeus.blogspot.com/>",
- "Sam Ruby <http://intertwingly.net/>"]
+ "Sam Ruby <http://intertwingly.net/>",
+ "Ade Oshineye <http://blog.oshineye.com/>"]
_debug = 0
# HTTP "User-Agent" header to send to servers when downloading feeds.
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
'http://purl.org/rss/1.0/modules/link/': 'l',
'http://search.yahoo.com/mrss': 'media',
+ # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+ 'http://search.yahoo.com/mrss/': 'media',
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
method = getattr(self, methodname)
return method(attrsD)
except AttributeError:
- return self.push(prefix + suffix, 1)
+ # There's no handler for this element, or something has gone wrong, so explicitly record the element and its attributes
+ unknown_tag = prefix + suffix
+ if len(attrsD) == 0:
+ # No attributes, so merge it into the enclosing dictionary
+ return self.push(unknown_tag, 1)
+ else:
+ # Has attributes so create it in its own dictionary
+ context = self._getContext()
+ context[unknown_tag] = attrsD
def unknown_endtag(self, tag):
if _debug: sys.stderr.write('end %s\n' % tag)
if _debug: sys.stderr.write('entering parse_declaration\n')
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
- if k == -1: k = len(self.rawdata)
+ if k == -1:
+ # CDATA block began but didn't finish
+ k = len(self.rawdata)
+ return k
self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
return k+3
else:
k = self.rawdata.find('>', i)
- return k+1
+ if k >= 0:
+ return k+1
+ else:
+ # Incomplete declaration (no closing '>' yet)
+ return k
def mapContentType(self, contentType):
contentType = contentType.lower()
'0.92': 'rss092',
'0.93': 'rss093',
'0.94': 'rss094'}
- if not self.version:
+ # If we're here then this is an RSS feed.
+ # If we don't have a version, or the version we have doesn't start with
+ # 'rss', something has gone wrong. Correct it.
+ if not self.version or not self.version.startswith('rss'):
attr_version = attrsD.get('version', '')
version = versionmap.get(attr_version)
if version:
def _getContext(self):
if self.insource:
context = self.sourcedata
- elif self.inimage:
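+ # only use the image context if feeddata['image'] has actually been created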
+ elif self.inimage and self.feeddata.has_key('image'):
context = self.feeddata['image']
elif self.intextinput:
context = self.feeddata['textinput']
context['id'] = href
def _start_source(self, attrsD):
+ if 'url' in attrsD:
+ # This means that we're processing a source element from an RSS 2.0 feed
+ self.sourcedata['href'] = attrsD['url']
+ self.push('source', 1)
self.insource = 1
self.hasTitle = 0
def _end_source(self):
self.insource = 0
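+ # in RSS 2.0 the text of <source> is the name of the originating channel; keep it as the title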
+ value = self.pop('source')
+ if value:
+ self.sourcedata['title'] = value
self._getContext()['source'] = copy.deepcopy(self.sourcedata)
self.sourcedata.clear()
if not context['media_thumbnail'][-1].has_key('url'):
context['media_thumbnail'][-1]['url'] = url
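+ # media:player (Media RSS): capture the element's attributes; the end handler stores its text as 'content'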
+ def _start_media_player(self, attrsD):
+ self.push('media_player', 0)
+ self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+ def _end_media_player(self):
+ value = self.pop('media_player')
+ context = self._getContext()
+ context['media_player']['content'] = value
+
if _XML_AVAILABLE:
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
def __init__(self, baseuri, baselang, encoding):
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
+ self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
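+ # remember xlink namespace declarations so startElementNS can re-attach them as attributes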
+ if uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
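+ # seed the attribute dict with any namespace declarations collected in startPrefixMapping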
+ attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
def error(self, exc):
self.bozo = 1
self.exc = exc
-
+
def fatalError(self, exc):
self.error(exc)
raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
special = re.compile('''[<>'"]''')
bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
-
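+ # "void" elements never take an end tag; the list now includes the HTML5 additions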
+ elements_no_end_tag = [
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ]
+
def __init__(self, encoding, type):
self.encoding = encoding
self.type = type
if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
sgmllib.SGMLParser.__init__(self)
-
+
def reset(self):
self.pieces = []
sgmllib.SGMLParser.reset(self)
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
- if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
self.pieces.append(text)
def handle_comment(self, text):
return _urljoin(self.baseuri, uri.strip())
def unknown_starttag(self, tag, attrs):
+ if _debug:
+ sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
attrs = self.normalize_attrs(attrs)
attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-
+
def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
- if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+ if _debug:
+ sys.stderr.write('entering _resolveRelativeURIs\n')
+
p = _RelativeURIResolver(baseURI, encoding, type)
p.feed(htmlSource)
return p.output()
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
+ # add implicit namespaces to html5 inline svg/mathml
+ if self.type.endswith('html'):
+ if not dict(attrs).get('xmlns'):
+ if tag=='svg':
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
# not otherwise acceptable, perhaps it is MathML or SVG?
if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
self.mathmlOK += 1
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
- if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+ # This replaced a regexp that used re.match and was prone to pathological back-tracking.
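+ # Strip every property:value pair; if anything other than whitespace remains, reject the whole style attribute.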
+ if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
'CC', r'(?P<century>\d\d$)')
+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
- + r'(:(?P<second>\d{2}(\.\d*)?))?'
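+ # fractional seconds get their own group so the whole-second value stays a plain two-digit field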
+ + r'(:(?P<second>\d{2}))?'
+ + r'(\.(?P<fracsecond>\d+))?'
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
for tmpl in _iso8601_tmpl]
del tmpl
except Exception, e:
result['bozo'] = 1
result['bozo_exception'] = e
- data = ''
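+ # None (rather than '') lets later checks tell a failed download apart from an empty document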
+ data = None
f = None
# if feed is gzip-compressed, decompress it
bozo_message = 'no Content-type specified'
result['bozo'] = 1
result['bozo_exception'] = NonXMLContentType(bozo_message)
-
- result['version'], data, entities = _stripDoctype(data)
+
+ if data is not None:
+ result['version'], data, entities = _stripDoctype(data)
baseuri = http_headers.get('content-location', result.get('href'))
baselang = http_headers.get('content-language', None)
return result
# if there was a problem downloading, we're done
- if not data:
+ if data is None:
return result
# determine character encoding
reset
delete n
"""
-__version__ = "2.66"
+__version__ = "2.67"
__author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
"Matej Cepl", "Martin 'Joey' Schulze",
"Marcel Ackermann (http://www.DreamFlasher.de)",
- "Lindsey Smith", "Aaron Swartz (original author)" ]
+ "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
import urllib2
urllib2.install_opener(urllib2.build_opener())
fromhdr = formataddr((sender_name, sender_addr))
msg['From'] = fromhdr
-
+
msg_as_string = msg.as_string()
#DEPRECATED if QP_REQUIRED:
#DEPRECATED ins, outs = SIO(msg_as_string), SIO()
#DEPRECATED mimify.mimify(ins, outs)
#DEPRECATED msg_as_string = outs.getvalue()
-
+
if SMTP_SEND:
if not smtpserver:
import smtplib
h2t.BODY_WIDTH = BODY_WIDTH
html2text = h2t.html2text
+from types import DictType
+
### Utility Functions ###
import threading
def getID(entry):
"""Get best ID from an entry."""
if TRUST_GUID:
- if 'id' in entry and entry.id: return entry.id
+ if 'id' in entry and entry.id:
+ # Newer versions of feedparser could return a dictionary
+ if type(entry.id) is DictType:
+ return entry.id.values()[0]
+
+ return entry.id
content = getContent(entry)
if content and content != "\n": return hash(unu(content)).hexdigest()
# Instead of letting these run wild, we put them in context
# by associating them with the actual ID (if it exists).
- frameid = entry.get('id', id)
+ frameid = entry.get('id')
+ if not frameid: frameid = id
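+ # as in getID(), feedparser may hand back the id as a dictionary; use its value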
+ if type(frameid) is DictType:
+ frameid = frameid.values()[0]
# If this item's ID is in our database
# then it's already been sent
# and we don't need to do anything more.
- if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
+ if frameid in f.seen:
+ if f.seen[frameid] == id: continue
if not (f.to or default_to):
print "No default email address defined. Please run 'r2e email emailaddress'"
from_addr = getEmail(r.feed, entry)
- name = getName(r, entry)
- fromhdr = '"'+ name + '" <' + from_addr + ">"
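+ # unescape any HTML entities in the sender name and let formataddr handle the quoting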
+ name = h2t.unescape(getName(r, entry))
+ fromhdr = formataddr((name, from_addr,))
tohdr = (f.to or default_to)
subjecthdr = title
datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
content += '<div id="entry">\n'
content += '<h1'
content += ' class="header"'
- content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n'
+ content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
if ishtml(entrycontent):
body = entrycontent[1].strip()
else:
content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
if (hasattr(enclosure, 'src') and enclosure.src != ""):
content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
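+ # rel="via" links point back at the original source; rewrite Google Reader public-atom URLs to their web view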
+ if 'links' in entry:
+ for extralink in entry.links:
+ if ('rel' in extralink) and extralink['rel'] == u'via':
+ extraurl = extralink['href']
+ extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
+ content += '<br/>Via: <a href="'+extraurl+'">'+extralink['title']+'</a>\n'
content += '</p></div>\n'
content += "\n\n</body></html>"
else:
for enclosure in entry.enclosures:
if enclosure.url != "":
content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
-
+ if 'links' in entry:
+ for extralink in entry.links:
+ if ('rel' in extralink) and extralink['rel'] == u'via':
+ content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
+
content += ("\n</body></html>")
else:
content = entrycontent.strip() + "\n\nURL: "+link
for enclosure in entry.enclosures:
if enclosure.url != "":
content += ('\nEnclosure: ' + enclosure.url + "\n")
+ if 'links' in entry:
+ for extralink in entry.links:
+ if ('rel' in extralink) and extralink['rel'] == u'via':
+ content += '\nVia: '+extralink['title']+' '+extralink['href']+'\n'
smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)