2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.4 or later
10 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
14 __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
16 Redistribution and use in source and binary forms, with or without modification,
17 are permitted provided that the following conditions are met:
19 * Redistributions of source code must retain the above copyright notice,
20 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright notice,
22 this list of conditions and the following disclaimer in the documentation
23 and/or other materials provided with the distribution.
25 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 POSSIBILITY OF SUCH DAMAGE."""
36 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
37 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
38 "John Beimler <http://john.beimler.org/>",
39 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
40 "Aaron Swartz <http://aaronsw.com/>",
41 "Kevin Marks <http://epeus.blogspot.com/>",
42 "Sam Ruby <http://intertwingly.net/>",
43 "Ade Oshineye <http://blog.oshineye.com/>",
44 "Martin Pool <http://sourcefrog.net/>",
45 "Kurt McKee <http://kurtmckee.org/>"]
48 # HTTP "User-Agent" header to send to servers when downloading feeds.
49 # If you are embedding feedparser in a larger application, you should
50 # change this to your application name and URL.
51 USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
53 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
54 # want to send an Accept header, set this to None.
55 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
57 # List of preferred XML parsers, by SAX driver name. These will be tried first,
58 # but if they're not installed, Python will keep searching through its own list
59 # of pre-installed parsers until it finds one that supports everything we need.
60 PREFERRED_XML_PARSERS = ["drv_libxml2"]
62 # If you want feedparser to automatically run HTML markup through HTML Tidy, set
63 # this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
64 # or utidylib <http://utidylib.berlios.de/>.
67 # List of Python interfaces for HTML Tidy, in order of preference. Only useful
69 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
71 # If you want feedparser to automatically resolve all relative URIs, set this
73 RESOLVE_RELATIVE_URIS = 1
75 # If you want feedparser to automatically sanitize all potentially unsafe
76 # HTML content, set this to 1.
79 # ---------- Python 3 modules (make it work if possible) ----------
83 from email import _parseaddr as rfc822
86 # Python 3.1 introduces bytes.maketrans and simultaneously
87 # deprecates string.maketrans; use bytes.maketrans if possible
88 _maketrans = bytes.maketrans
89 except (NameError, AttributeError):
91 _maketrans = string.maketrans
93 # base64 support for Atom feeds that contain embedded binary data
95 import base64, binascii
96 # Python 3.1 deprecates decodestring in favor of decodebytes
97 _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
99 base64 = binascii = None
102 # Convert a UTF-8 str to bytes if the interpreter is Python 3
104 return bytes(s, 'utf8')
105 except (NameError, TypeError):
106 # In Python 2.5 and below, bytes doesn't exist (NameError)
107 # In Python 2.6 and above, bytes and str are the same (TypeError)
111 # Convert a list of ints to bytes if the interpreter is Python 3
114 # In Python 2.6 and above, this call won't raise an exception
115 # but it will return bytes([65]) as '[65]' instead of 'A'
119 return ''.join(map(chr, l))
121 # If you want feedparser to allow all URL schemes, set this to ()
122 # List culled from Python's urlparse documentation at:
123 # http://docs.python.org/library/urlparse.html
124 # as well as from "URI scheme" at Wikipedia:
125 # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
126 # Many more will likely need to be added!
127 ACCEPTABLE_URI_SCHEMES = (
128 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
129 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
130 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
131 # Additional common-but-unofficial schemes
132 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
133 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
135 #ACCEPTABLE_URI_SCHEMES = ()
137 # ---------- required modules (should come with any Python distribution) ----------
138 import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime
140 from io import BytesIO as _StringIO
143 from cStringIO import StringIO as _StringIO
145 from StringIO import StringIO as _StringIO
147 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
149 # gzip is included with most Python distributions, but may not be available if you compiled your own
159 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
160 # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
161 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
162 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
165 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
166 from xml.sax.saxutils import escape as _xmlescape
170 def _xmlescape(data,entities={}):
171 data = data.replace('&', '&')
172 data = data.replace('>', '>')
173 data = data.replace('<', '<')
174 for char, entity in entities:
175 data = data.replace(char, entity)
178 # cjkcodecs and iconv_codec provide support for more character encodings.
179 # Both are available from http://cjkpython.i18n.org/
181 import cjkcodecs.aliases
189 # chardet library auto-detects character encodings
190 # Download from http://chardet.feedparser.org/
194 import chardet.constants
195 chardet.constants._debug = 1
199 # reversable htmlentitydefs mappings for Python 2.2
201 from htmlentitydefs import name2codepoint, codepoint2name
203 import htmlentitydefs
206 for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
207 if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
208 name2codepoint[name]=ord(codepoint)
209 codepoint2name[ord(codepoint)]=name
211 # BeautifulSoup parser used for parsing microformats from embedded HTML content
212 # http://www.crummy.com/software/BeautifulSoup/
213 # feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
214 # older 2.x series. If it doesn't, and you can figure out why, I'll accept a
215 # patch and modify the compatibility statement accordingly.
221 # ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception):
    """Base class for benign, informational parser exceptions.

    Subclasses signal non-fatal conditions discovered while parsing a feed.
    """
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """Informational: the declared character encoding was overridden."""
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """Informational: the character encoding could not be determined."""
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """Informational: the served Content-Type was not an XML type."""
class UndeclaredNamespace(Exception):
    """Raised when feed markup uses an XML namespace it never declared."""
# Monkey-patch sgmllib's tokenizer regexes so the parser copes with the
# markup seen in real-world feeds (colons/dots in tag names, hex charrefs).
# Raw string literals are used so regex escapes such as \d are not
# (mis)interpreted as string escapes, which modern Python warns about.
sgmllib.tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile(r'<!')
sgmllib.charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);')
232 if sgmllib.endbracket.search(' <').start(0):
233 class EndBracketRegEx:
235 # Overriding the built-in sgmllib.endbracket regex allows the
236 # parser to find angle brackets embedded in element attributes.
237 self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
238 def search(self,string,index=0):
239 match = self.endbracket.match(string,index)
240 if match is not None:
241 # Returning a new object in the calling thread's context
# resolves a thread-safety issue.
243 return EndBracketMatch(match)
245 class EndBracketMatch:
246 def __init__(self, match):
249 return self.match.end(n)
250 sgmllib.endbracket = EndBracketRegEx()
252 SUPPORTED_VERSIONS = {'': 'unknown',
253 'rss090': 'RSS 0.90',
254 'rss091n': 'RSS 0.91 (Netscape)',
255 'rss091u': 'RSS 0.91 (Userland)',
256 'rss092': 'RSS 0.92',
257 'rss093': 'RSS 0.93',
258 'rss094': 'RSS 0.94',
261 'rss': 'RSS (unknown version)',
262 'atom01': 'Atom 0.1',
263 'atom02': 'Atom 0.2',
264 'atom03': 'Atom 0.3',
265 'atom10': 'Atom 1.0',
266 'atom': 'Atom (unknown version)',
274 # Python 2.1 does not have dict
275 from UserDict import UserDict
282 class FeedParserDict(UserDict):
283 keymap = {'channel': 'feed',
287 'date_parsed': 'updated_parsed',
288 'description': ['summary', 'subtitle'],
290 'modified': 'updated',
291 'modified_parsed': 'updated_parsed',
292 'issued': 'published',
293 'issued_parsed': 'published_parsed',
294 'copyright': 'rights',
295 'copyright_detail': 'rights_detail',
296 'tagline': 'subtitle',
297 'tagline_detail': 'subtitle_detail'}
298 def __getitem__(self, key):
299 if key == 'category':
300 return UserDict.__getitem__(self, 'tags')[0]['term']
301 if key == 'enclosures':
302 norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
303 return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
305 for link in UserDict.__getitem__(self, 'links'):
306 if link['rel']=='license' and link.has_key('href'):
308 if key == 'categories':
309 return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
310 realkey = self.keymap.get(key, key)
311 if type(realkey) == types.ListType:
313 if UserDict.__contains__(self, k):
314 return UserDict.__getitem__(self, k)
315 if UserDict.__contains__(self, key):
316 return UserDict.__getitem__(self, key)
317 return UserDict.__getitem__(self, realkey)
319 def __setitem__(self, key, value):
320 for k in self.keymap.keys():
323 if type(key) == types.ListType:
325 return UserDict.__setitem__(self, key, value)
327 def get(self, key, default=None):
328 if self.has_key(key):
333 def setdefault(self, key, value):
334 if not self.has_key(key):
338 def has_key(self, key):
340 return hasattr(self, key) or UserDict.__contains__(self, key)
341 except AttributeError:
343 # This alias prevents the 2to3 tool from changing the semantics of the
344 # __contains__ function below and exhausting the maximum recursion depth
347 def __getattr__(self, key):
349 return self.__dict__[key]
353 assert not key.startswith('_')
354 return self.__getitem__(key)
356 raise AttributeError, "object has no attribute '%s'" % key
358 def __setattr__(self, key, value):
359 if key.startswith('_') or key == 'data':
360 self.__dict__[key] = value
362 return self.__setitem__(key, value)
364 def __contains__(self, key):
365 return self.__has_key(key)
367 def zopeCompatibilityHack():
368 global FeedParserDict
370 def FeedParserDict(aDict=None):
376 _ebcdic_to_ascii_map = None
377 def _ebcdic_to_ascii(s):
378 global _ebcdic_to_ascii_map
379 if not _ebcdic_to_ascii_map:
381 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
382 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
383 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
384 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
385 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
386 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
387 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
388 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
389 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
390 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
391 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
392 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
393 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
394 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
395 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
396 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
398 _ebcdic_to_ascii_map = _maketrans( \
399 _l2bytes(range(256)), _l2bytes(emap))
400 return s.translate(_ebcdic_to_ascii_map)
403 unichr(128): unichr(8364), # euro sign
404 unichr(130): unichr(8218), # single low-9 quotation mark
405 unichr(131): unichr( 402), # latin small letter f with hook
406 unichr(132): unichr(8222), # double low-9 quotation mark
407 unichr(133): unichr(8230), # horizontal ellipsis
408 unichr(134): unichr(8224), # dagger
409 unichr(135): unichr(8225), # double dagger
410 unichr(136): unichr( 710), # modifier letter circumflex accent
411 unichr(137): unichr(8240), # per mille sign
412 unichr(138): unichr( 352), # latin capital letter s with caron
413 unichr(139): unichr(8249), # single left-pointing angle quotation mark
414 unichr(140): unichr( 338), # latin capital ligature oe
415 unichr(142): unichr( 381), # latin capital letter z with caron
416 unichr(145): unichr(8216), # left single quotation mark
417 unichr(146): unichr(8217), # right single quotation mark
418 unichr(147): unichr(8220), # left double quotation mark
419 unichr(148): unichr(8221), # right double quotation mark
420 unichr(149): unichr(8226), # bullet
421 unichr(150): unichr(8211), # en dash
422 unichr(151): unichr(8212), # em dash
423 unichr(152): unichr( 732), # small tilde
424 unichr(153): unichr(8482), # trade mark sign
425 unichr(154): unichr( 353), # latin small letter s with caron
426 unichr(155): unichr(8250), # single right-pointing angle quotation mark
427 unichr(156): unichr( 339), # latin small ligature oe
428 unichr(158): unichr( 382), # latin small letter z with caron
429 unichr(159): unichr( 376)} # latin capital letter y with diaeresis
431 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
432 def _urljoin(base, uri):
433 uri = _urifixer.sub(r'\1\3', uri)
435 return urlparse.urljoin(base, uri)
437 uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
438 return urlparse.urljoin(base, uri)
440 class _FeedParserMixin:
441 namespaces = {'': '',
442 'http://backend.userland.com/rss': '',
443 'http://blogs.law.harvard.edu/tech/rss': '',
444 'http://purl.org/rss/1.0/': '',
445 'http://my.netscape.com/rdf/simple/0.9/': '',
446 'http://example.com/newformat#': '',
447 'http://example.com/necho': '',
448 'http://purl.org/echo/': '',
449 'uri/of/echo/namespace#': '',
450 'http://purl.org/pie/': '',
451 'http://purl.org/atom/ns#': '',
452 'http://www.w3.org/2005/Atom': '',
453 'http://purl.org/rss/1.0/modules/rss091#': '',
455 'http://webns.net/mvcb/': 'admin',
456 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
457 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
458 'http://media.tangent.org/rss/1.0/': 'audio',
459 'http://backend.userland.com/blogChannelModule': 'blogChannel',
460 'http://web.resource.org/cc/': 'cc',
461 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
462 'http://purl.org/rss/1.0/modules/company': 'co',
463 'http://purl.org/rss/1.0/modules/content/': 'content',
464 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
465 'http://purl.org/dc/elements/1.1/': 'dc',
466 'http://purl.org/dc/terms/': 'dcterms',
467 'http://purl.org/rss/1.0/modules/email/': 'email',
468 'http://purl.org/rss/1.0/modules/event/': 'ev',
469 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
470 'http://freshmeat.net/rss/fm/': 'fm',
471 'http://xmlns.com/foaf/0.1/': 'foaf',
472 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
473 'http://postneo.com/icbm/': 'icbm',
474 'http://purl.org/rss/1.0/modules/image/': 'image',
475 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
476 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
477 'http://purl.org/rss/1.0/modules/link/': 'l',
478 'http://search.yahoo.com/mrss': 'media',
479 #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
480 'http://search.yahoo.com/mrss/': 'media',
481 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
482 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
483 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
484 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
485 'http://purl.org/rss/1.0/modules/reference/': 'ref',
486 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
487 'http://purl.org/rss/1.0/modules/search/': 'search',
488 'http://purl.org/rss/1.0/modules/slash/': 'slash',
489 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
490 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
491 'http://hacks.benhammersley.com/rss/streaming/': 'str',
492 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
493 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
494 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
495 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
496 'http://purl.org/rss/1.0/modules/threading/': 'thr',
497 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
498 'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
499 'http://wellformedweb.org/commentAPI/': 'wfw',
500 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
501 'http://www.w3.org/1999/xhtml': 'xhtml',
502 'http://www.w3.org/1999/xlink': 'xlink',
503 'http://www.w3.org/XML/1998/namespace': 'xml'
505 _matchnamespaces = {}
507 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
508 can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
509 can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
510 html_types = ['text/html', 'application/xhtml+xml']
512 def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
513 if _debug: sys.stderr.write('initializing FeedParser\n')
514 if not self._matchnamespaces:
515 for k, v in self.namespaces.items():
516 self._matchnamespaces[k.lower()] = v
517 self.feeddata = FeedParserDict() # feed-level data
518 self.encoding = encoding # character encoding
519 self.entries = [] # list of entry-level data
520 self.version = '' # feed type/version, see SUPPORTED_VERSIONS
521 self.namespacesInUse = {} # dictionary of namespaces defined by the feed
523 # the following are used internally to track state;
524 # this is really out of control and should be refactored
531 self.incontributor = 0
534 self.sourcedata = FeedParserDict()
535 self.contentparams = FeedParserDict()
536 self._summaryKey = None
537 self.namespacemap = {}
538 self.elementstack = []
541 self.baseuri = baseuri or ''
542 self.lang = baselang or None
546 self.feeddata['language'] = baselang.replace('_','-')
548 def unknown_starttag(self, tag, attrs):
549 if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
551 attrs = [(k.lower(), v) for k, v in attrs]
552 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
553 # the sgml parser doesn't handle entities in attributes, but
554 # strict xml parsers do -- account for this difference
555 if isinstance(self, _LooseFeedParser):
556 attrs = [(k, v.replace('&', '&')) for k, v in attrs]
558 # track xml:base and xml:lang
560 baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
561 if type(baseuri) != type(u''):
563 baseuri = unicode(baseuri, self.encoding)
565 baseuri = unicode(baseuri, 'iso-8859-1')
566 # ensure that self.baseuri is always an absolute URI that
# uses a whitelisted URI scheme (e.g. not `javascript:`)
569 self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
571 self.baseuri = _urljoin(self.baseuri, baseuri)
572 lang = attrsD.get('xml:lang', attrsD.get('lang'))
574 # xml:lang could be explicitly set to '', we need to capture that
577 # if no xml:lang is specified, use parent lang
580 if tag in ('feed', 'rss', 'rdf:RDF'):
581 self.feeddata['language'] = lang.replace('_','-')
583 self.basestack.append(self.baseuri)
584 self.langstack.append(lang)
587 for prefix, uri in attrs:
588 if prefix.startswith('xmlns:'):
589 self.trackNamespace(prefix[6:], uri)
590 elif prefix == 'xmlns':
591 self.trackNamespace(None, uri)
593 # track inline content
594 if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
595 if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
596 # element declared itself as escaped markup, but it isn't really
597 self.contentparams['type'] = 'application/xhtml+xml'
598 if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
599 if tag.find(':') <> -1:
600 prefix, tag = tag.split(':', 1)
601 namespace = self.namespacesInUse.get(prefix, '')
602 if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
603 attrs.append(('xmlns',namespace))
604 if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
605 attrs.append(('xmlns',namespace))
606 if tag == 'svg': self.svgOK += 1
607 return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
610 if tag.find(':') <> -1:
611 prefix, suffix = tag.split(':', 1)
613 prefix, suffix = '', tag
614 prefix = self.namespacemap.get(prefix, prefix)
616 prefix = prefix + '_'
618 # special hack for better tracking of empty textinput/image elements in illformed feeds
619 if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
621 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
624 # call special handler (if defined) or default handler
625 methodname = '_start_' + prefix + suffix
627 method = getattr(self, methodname)
628 return method(attrsD)
629 except AttributeError:
630 # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
631 unknown_tag = prefix + suffix
# No attributes so merge it into the enclosing dictionary
634 return self.push(unknown_tag, 1)
636 # Has attributes so create it in its own dictionary
637 context = self._getContext()
638 context[unknown_tag] = attrsD
640 def unknown_endtag(self, tag):
641 if _debug: sys.stderr.write('end %s\n' % tag)
643 if tag.find(':') <> -1:
644 prefix, suffix = tag.split(':', 1)
646 prefix, suffix = '', tag
647 prefix = self.namespacemap.get(prefix, prefix)
649 prefix = prefix + '_'
650 if suffix == 'svg' and self.svgOK: self.svgOK -= 1
652 # call special handler (if defined) or default handler
653 methodname = '_end_' + prefix + suffix
655 if self.svgOK: raise AttributeError()
656 method = getattr(self, methodname)
658 except AttributeError:
659 self.pop(prefix + suffix)
661 # track inline content
662 if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
663 # element declared itself as escaped markup, but it isn't really
664 if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
665 self.contentparams['type'] = 'application/xhtml+xml'
666 if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
667 tag = tag.split(':')[-1]
668 self.handle_data('</%s>' % tag, escape=0)
670 # track xml:base and xml:lang going out of scope
673 if self.basestack and self.basestack[-1]:
674 self.baseuri = self.basestack[-1]
677 if self.langstack: # and (self.langstack[-1] is not None):
678 self.lang = self.langstack[-1]
680 def handle_charref(self, ref):
681 # called for each character reference, e.g. for ' ', ref will be '160'
682 if not self.elementstack: return
684 if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
691 text = unichr(c).encode('utf-8')
692 self.elementstack[-1][2].append(text)
694 def handle_entityref(self, ref):
695 # called for each entity reference, e.g. for '©', ref will be 'copy'
696 if not self.elementstack: return
697 if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
698 if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
700 elif ref in self.entities.keys():
701 text = self.entities[ref]
702 if text.startswith('&#') and text.endswith(';'):
703 return self.handle_entityref(text)
705 try: name2codepoint[ref]
706 except KeyError: text = '&%s;' % ref
707 else: text = unichr(name2codepoint[ref]).encode('utf-8')
708 self.elementstack[-1][2].append(text)
def handle_data(self, text, escape=1):
    """Accumulate a run of plain character data onto the open element.

    Called for each block of text outside of any tag and containing no
    character or entity references.  When `escape` is true and we are
    inside inline XHTML content, the text is XML-escaped first.
    """
    if not self.elementstack:
        return
    in_xhtml = self.contentparams.get('type') == 'application/xhtml+xml'
    if escape and in_xhtml:
        text = _xmlescape(text)
    self.elementstack[-1][2].append(text)
718 def handle_comment(self, text):
719 # called for each comment, e.g. <!-- insert message here -->
722 def handle_pi(self, text):
723 # called for each processing instruction, e.g. <?instruction>
726 def handle_decl(self, text):
729 def parse_declaration(self, i):
730 # override internal declaration handler to handle CDATA blocks
731 if _debug: sys.stderr.write('entering parse_declaration\n')
732 if self.rawdata[i:i+9] == '<![CDATA[':
733 k = self.rawdata.find(']]>', i)
735 # CDATA block began but didn't finish
736 k = len(self.rawdata)
738 self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
741 k = self.rawdata.find('>', i)
745 # We have an incomplete CDATA block.
748 def mapContentType(self, contentType):
749 contentType = contentType.lower()
750 if contentType == 'text':
751 contentType = 'text/plain'
752 elif contentType == 'html':
753 contentType = 'text/html'
754 elif contentType == 'xhtml':
755 contentType = 'application/xhtml+xml'
758 def trackNamespace(self, prefix, uri):
759 loweruri = uri.lower()
760 if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
761 self.version = 'rss090'
762 if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
763 self.version = 'rss10'
764 if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
765 self.version = 'atom10'
766 if loweruri.find('backend.userland.com/rss') <> -1:
767 # match any backend.userland.com namespace
768 uri = 'http://backend.userland.com/rss'
770 if self._matchnamespaces.has_key(loweruri):
771 self.namespacemap[prefix] = self._matchnamespaces[loweruri]
772 self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
774 self.namespacesInUse[prefix or ''] = uri
def resolveURI(self, uri):
    """Resolve `uri` against the currently tracked xml:base, if any."""
    base = self.baseuri or ''
    return _urljoin(base, uri)
779 def decodeEntities(self, element, data):
def strattrs(self, attrs):
    """Serialize (name, value) attribute pairs back into start-tag markup.

    Each value is XML-escaped, with embedded double quotes mapped to
    &quot; so the result is safe inside the double-quoted attribute
    syntax emitted here.  (The extra-entities map must translate '"' to
    '&quot;'; an identity mapping would leave the generated markup
    broken whenever a value contains a quote.)
    """
    return ''.join([' %s="%s"' % (name, _xmlescape(value, {'"': '&quot;'}))
                    for name, value in attrs])
def push(self, element, expectingText):
    """Open a new element frame: [name, expectingText flag, text pieces]."""
    frame = [element, expectingText, []]
    self.elementstack.append(frame)
788 def pop(self, element, stripWhitespace=1):
789 if not self.elementstack: return
790 if self.elementstack[-1][0] != element: return
792 element, expectingText, pieces = self.elementstack.pop()
794 if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
795 # remove enclosing child element, but only if it is a <div> and
796 # only if all the remaining content is nested underneath it.
797 # This means that the divs would be retained in the following:
798 # <div>foo</div><div>bar</div>
799 while pieces and len(pieces)>1 and not pieces[-1].strip():
801 while pieces and len(pieces)>1 and not pieces[0].strip():
803 if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
805 for piece in pieces[:-1]:
806 if piece.startswith('</'):
809 elif piece.startswith('<') and not piece.endswith('/>'):
812 pieces = pieces[1:-1]
814 # Ensure each piece is a str for Python 3
815 for (i, v) in enumerate(pieces):
816 if not isinstance(v, basestring):
817 pieces[i] = v.decode('utf-8')
819 output = ''.join(pieces)
821 output = output.strip()
822 if not expectingText: return output
824 # decode base64 content
825 if base64 and self.contentparams.get('base64', 0):
827 output = _base64decode(output)
828 except binascii.Error:
830 except binascii.Incomplete:
833 # In Python 3, base64 takes and outputs bytes, not str
834 # This may not be the most correct way to accomplish this
835 output = _base64decode(output.encode('utf-8')).decode('utf-8')
837 # resolve relative URIs
838 if (element in self.can_be_relative_uri) and output:
839 output = self.resolveURI(output)
841 # decode entities within embedded markup
842 if not self.contentparams.get('base64', 0):
843 output = self.decodeEntities(element, output)
845 if self.lookslikehtml(output):
846 self.contentparams['type']='text/html'
848 # remove temporary cruft from contentparams
850 del self.contentparams['mode']
854 del self.contentparams['base64']
858 is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
859 # resolve relative URIs within embedded markup
860 if is_htmlish and RESOLVE_RELATIVE_URIS:
861 if element in self.can_contain_relative_uris:
862 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
865 # (must do this before sanitizing because some microformats
866 # rely on elements that we sanitize)
867 if is_htmlish and element in ['content', 'description', 'summary']:
868 mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
870 for tag in mfresults.get('tags', []):
871 self._addTag(tag['term'], tag['scheme'], tag['label'])
872 for enclosure in mfresults.get('enclosures', []):
873 self._start_enclosure(enclosure)
874 for xfn in mfresults.get('xfn', []):
875 self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
876 vcard = mfresults.get('vcard')
878 self._getContext()['vcard'] = vcard
880 # sanitize embedded markup
881 if is_htmlish and SANITIZE_HTML:
882 if element in self.can_contain_dangerous_markup:
883 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
885 if self.encoding and type(output) != type(u''):
887 output = unicode(output, self.encoding)
891 # address common error where people take data that is already
892 # utf-8, presume that it is iso-8859-1, and re-encode it.
893 if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''):
895 output = unicode(output.encode('iso-8859-1'), 'utf-8')
899 # map win-1252 extensions to the proper code points
900 if type(output) == type(u''):
901 output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
903 # categories/tags/keywords/whatever are handled in _end_category
904 if element == 'category':
907 if element == 'title' and self.hasTitle:
910 # store output in appropriate place(s)
911 if self.inentry and not self.insource:
912 if element == 'content':
913 self.entries[-1].setdefault(element, [])
914 contentparams = copy.deepcopy(self.contentparams)
915 contentparams['value'] = output
916 self.entries[-1][element].append(contentparams)
917 elif element == 'link':
919 # query variables in urls in link elements are improperly
920 # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
921 # unhandled character references. fix this special case.
922 output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
923 self.entries[-1][element] = output
925 self.entries[-1]['links'][-1]['href'] = output
927 if element == 'description':
929 self.entries[-1][element] = output
931 contentparams = copy.deepcopy(self.contentparams)
932 contentparams['value'] = output
933 self.entries[-1][element + '_detail'] = contentparams
934 elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
935 context = self._getContext()
936 if element == 'description':
938 context[element] = output
939 if element == 'link':
940 # fix query variables; see above for the explanation
941 output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
942 context[element] = output
943 context['links'][-1]['href'] = output
945 contentparams = copy.deepcopy(self.contentparams)
946 contentparams['value'] = output
947 context[element + '_detail'] = contentparams
950 def pushContent(self, tag, attrsD, defaultContentType, expectingText):
952 if self.lang: self.lang=self.lang.replace('_','-')
953 self.contentparams = FeedParserDict({
954 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
955 'language': self.lang,
956 'base': self.baseuri})
957 self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
958 self.push(tag, expectingText)
960 def popContent(self, tag):
961 value = self.pop(tag)
963 self.contentparams.clear()
966 # a number of elements in a number of RSS variants are nominally plain
967 # text, but this is routinely ignored. This is an attempt to detect
968 # the most common cases. As false positives often result in silent
969 # data loss, this function errs on the conservative side.
970 def lookslikehtml(self, s):
971 if self.version.startswith('atom'): return
972 if self.contentparams.get('type','text/html') != 'text/plain': return
974 # must have a close tag or a entity reference to qualify
975 if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return
977 # all tags must be in a restricted subset of valid HTML tags
978 if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
979 re.findall(r'</?(\w+)',s)): return
981 # all entities must have been defined as valid HTML entities
982 from htmlentitydefs import entitydefs
983 if filter(lambda e: e not in entitydefs.keys(),
984 re.findall(r'&(\w+);',s)): return
988 def _mapToStandardPrefix(self, name):
989 colonpos = name.find(':')
991 prefix = name[:colonpos]
992 suffix = name[colonpos+1:]
993 prefix = self.namespacemap.get(prefix, prefix)
994 name = prefix + ':' + suffix
997 def _getAttribute(self, attrsD, name):
998 return attrsD.get(self._mapToStandardPrefix(name))
1000 def _isBase64(self, attrsD, contentparams):
1001 if attrsD.get('mode', '') == 'base64':
1003 if self.contentparams['type'].startswith('text/'):
1005 if self.contentparams['type'].endswith('+xml'):
1007 if self.contentparams['type'].endswith('/xml'):
1011 def _itsAnHrefDamnIt(self, attrsD):
1012 href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1022 attrsD['href'] = href
1025 def _save(self, key, value, overwrite=False):
1026 context = self._getContext()
1028 context[key] = value
1030 context.setdefault(key, value)
1032 def _start_rss(self, attrsD):
1033 versionmap = {'0.91': 'rss091u',
1037 #If we're here then this is an RSS feed.
1038 #If we don't have a version or have a version that starts with something
1039 #other than RSS then there's been a mistake. Correct it.
1040 if not self.version or not self.version.startswith('rss'):
1041 attr_version = attrsD.get('version', '')
1042 version = versionmap.get(attr_version)
1044 self.version = version
1045 elif attr_version.startswith('2.'):
1046 self.version = 'rss20'
1048 self.version = 'rss'
1050 def _start_dlhottitles(self, attrsD):
1051 self.version = 'hotrss'
1053 def _start_channel(self, attrsD):
1055 self._cdf_common(attrsD)
1056 _start_feedinfo = _start_channel
1058 def _cdf_common(self, attrsD):
1059 if attrsD.has_key('lastmod'):
1060 self._start_modified({})
1061 self.elementstack[-1][-1] = attrsD['lastmod']
1062 self._end_modified()
1063 if attrsD.has_key('href'):
1064 self._start_link({})
1065 self.elementstack[-1][-1] = attrsD['href']
1068 def _start_feed(self, attrsD):
1070 versionmap = {'0.1': 'atom01',
1073 if not self.version:
1074 attr_version = attrsD.get('version')
1075 version = versionmap.get(attr_version)
1077 self.version = version
1079 self.version = 'atom'
1081 def _end_channel(self):
1083 _end_feed = _end_channel
1085 def _start_image(self, attrsD):
1086 context = self._getContext()
1087 if not self.inentry:
1088 context.setdefault('image', FeedParserDict())
1091 self.push('image', 0)
1093 def _end_image(self):
1097 def _start_textinput(self, attrsD):
1098 context = self._getContext()
1099 context.setdefault('textinput', FeedParserDict())
1100 self.intextinput = 1
1102 self.push('textinput', 0)
1103 _start_textInput = _start_textinput
1105 def _end_textinput(self):
1106 self.pop('textinput')
1107 self.intextinput = 0
1108 _end_textInput = _end_textinput
1110 def _start_author(self, attrsD):
1112 self.push('author', 1)
1113 # Append a new FeedParserDict when expecting an author
1114 context = self._getContext()
1115 context.setdefault('authors', [])
1116 context['authors'].append(FeedParserDict())
1117 _start_managingeditor = _start_author
1118 _start_dc_author = _start_author
1119 _start_dc_creator = _start_author
1120 _start_itunes_author = _start_author
1122 def _end_author(self):
1125 self._sync_author_detail()
1126 _end_managingeditor = _end_author
1127 _end_dc_author = _end_author
1128 _end_dc_creator = _end_author
1129 _end_itunes_author = _end_author
1131 def _start_itunes_owner(self, attrsD):
1132 self.inpublisher = 1
1133 self.push('publisher', 0)
1135 def _end_itunes_owner(self):
1136 self.pop('publisher')
1137 self.inpublisher = 0
1138 self._sync_author_detail('publisher')
1140 def _start_contributor(self, attrsD):
1141 self.incontributor = 1
1142 context = self._getContext()
1143 context.setdefault('contributors', [])
1144 context['contributors'].append(FeedParserDict())
1145 self.push('contributor', 0)
1147 def _end_contributor(self):
1148 self.pop('contributor')
1149 self.incontributor = 0
1151 def _start_dc_contributor(self, attrsD):
1152 self.incontributor = 1
1153 context = self._getContext()
1154 context.setdefault('contributors', [])
1155 context['contributors'].append(FeedParserDict())
1156 self.push('name', 0)
1158 def _end_dc_contributor(self):
1160 self.incontributor = 0
1162 def _start_name(self, attrsD):
1163 self.push('name', 0)
1164 _start_itunes_name = _start_name
1166 def _end_name(self):
1167 value = self.pop('name')
1168 if self.inpublisher:
1169 self._save_author('name', value, 'publisher')
1171 self._save_author('name', value)
1172 elif self.incontributor:
1173 self._save_contributor('name', value)
1174 elif self.intextinput:
1175 context = self._getContext()
1176 context['name'] = value
1177 _end_itunes_name = _end_name
1179 def _start_width(self, attrsD):
1180 self.push('width', 0)
1182 def _end_width(self):
1183 value = self.pop('width')
1189 context = self._getContext()
1190 context['width'] = value
1192 def _start_height(self, attrsD):
1193 self.push('height', 0)
1195 def _end_height(self):
1196 value = self.pop('height')
1202 context = self._getContext()
1203 context['height'] = value
1205 def _start_url(self, attrsD):
1206 self.push('href', 1)
1207 _start_homepage = _start_url
1208 _start_uri = _start_url
1211 value = self.pop('href')
1213 self._save_author('href', value)
1214 elif self.incontributor:
1215 self._save_contributor('href', value)
1216 _end_homepage = _end_url
1219 def _start_email(self, attrsD):
1220 self.push('email', 0)
1221 _start_itunes_email = _start_email
1223 def _end_email(self):
1224 value = self.pop('email')
1225 if self.inpublisher:
1226 self._save_author('email', value, 'publisher')
1228 self._save_author('email', value)
1229 elif self.incontributor:
1230 self._save_contributor('email', value)
1231 _end_itunes_email = _end_email
1233 def _getContext(self):
1235 context = self.sourcedata
1236 elif self.inimage and self.feeddata.has_key('image'):
1237 context = self.feeddata['image']
1238 elif self.intextinput:
1239 context = self.feeddata['textinput']
1241 context = self.entries[-1]
1243 context = self.feeddata
1246 def _save_author(self, key, value, prefix='author'):
1247 context = self._getContext()
1248 context.setdefault(prefix + '_detail', FeedParserDict())
1249 context[prefix + '_detail'][key] = value
1250 self._sync_author_detail()
1251 context.setdefault('authors', [FeedParserDict()])
1252 context['authors'][-1][key] = value
1254 def _save_contributor(self, key, value):
1255 context = self._getContext()
1256 context.setdefault('contributors', [FeedParserDict()])
1257 context['contributors'][-1][key] = value
1259 def _sync_author_detail(self, key='author'):
1260 context = self._getContext()
1261 detail = context.get('%s_detail' % key)
1263 name = detail.get('name')
1264 email = detail.get('email')
1266 context[key] = '%s (%s)' % (name, email)
1270 context[key] = email
1272 author, email = context.get(key), None
1273 if not author: return
1274 emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1276 email = emailmatch.group(0)
1277 # probably a better way to do the following, but it passes all the tests
1278 author = author.replace(email, '')
1279 author = author.replace('()', '')
1280 author = author.replace('<>', '')
1281 author = author.replace('<>', '')
1282 author = author.strip()
1283 if author and (author[0] == '('):
1285 if author and (author[-1] == ')'):
1286 author = author[:-1]
1287 author = author.strip()
1289 context.setdefault('%s_detail' % key, FeedParserDict())
1291 context['%s_detail' % key]['name'] = author
1293 context['%s_detail' % key]['email'] = email
1295 def _start_subtitle(self, attrsD):
1296 self.pushContent('subtitle', attrsD, 'text/plain', 1)
1297 _start_tagline = _start_subtitle
1298 _start_itunes_subtitle = _start_subtitle
1300 def _end_subtitle(self):
1301 self.popContent('subtitle')
1302 _end_tagline = _end_subtitle
1303 _end_itunes_subtitle = _end_subtitle
1305 def _start_rights(self, attrsD):
1306 self.pushContent('rights', attrsD, 'text/plain', 1)
1307 _start_dc_rights = _start_rights
1308 _start_copyright = _start_rights
1310 def _end_rights(self):
1311 self.popContent('rights')
1312 _end_dc_rights = _end_rights
1313 _end_copyright = _end_rights
1315 def _start_item(self, attrsD):
1316 self.entries.append(FeedParserDict())
1317 self.push('item', 0)
1321 id = self._getAttribute(attrsD, 'rdf:about')
1323 context = self._getContext()
1325 self._cdf_common(attrsD)
1326 _start_entry = _start_item
1327 _start_product = _start_item
1329 def _end_item(self):
1332 _end_entry = _end_item
1334 def _start_dc_language(self, attrsD):
1335 self.push('language', 1)
1336 _start_language = _start_dc_language
1338 def _end_dc_language(self):
1339 self.lang = self.pop('language')
1340 _end_language = _end_dc_language
1342 def _start_dc_publisher(self, attrsD):
1343 self.push('publisher', 1)
1344 _start_webmaster = _start_dc_publisher
1346 def _end_dc_publisher(self):
1347 self.pop('publisher')
1348 self._sync_author_detail('publisher')
1349 _end_webmaster = _end_dc_publisher
1351 def _start_published(self, attrsD):
1352 self.push('published', 1)
1353 _start_dcterms_issued = _start_published
1354 _start_issued = _start_published
1356 def _end_published(self):
1357 value = self.pop('published')
1358 self._save('published_parsed', _parse_date(value), overwrite=True)
1359 _end_dcterms_issued = _end_published
1360 _end_issued = _end_published
1362 def _start_updated(self, attrsD):
1363 self.push('updated', 1)
1364 _start_modified = _start_updated
1365 _start_dcterms_modified = _start_updated
1366 _start_pubdate = _start_updated
1367 _start_dc_date = _start_updated
1368 _start_lastbuilddate = _start_updated
1370 def _end_updated(self):
1371 value = self.pop('updated')
1372 parsed_value = _parse_date(value)
1373 self._save('updated_parsed', parsed_value, overwrite=True)
1374 _end_modified = _end_updated
1375 _end_dcterms_modified = _end_updated
1376 _end_pubdate = _end_updated
1377 _end_dc_date = _end_updated
1378 _end_lastbuilddate = _end_updated
1380 def _start_created(self, attrsD):
1381 self.push('created', 1)
1382 _start_dcterms_created = _start_created
1384 def _end_created(self):
1385 value = self.pop('created')
1386 self._save('created_parsed', _parse_date(value), overwrite=True)
1387 _end_dcterms_created = _end_created
1389 def _start_expirationdate(self, attrsD):
1390 self.push('expired', 1)
1392 def _end_expirationdate(self):
1393 self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1395 def _start_cc_license(self, attrsD):
1396 context = self._getContext()
1397 value = self._getAttribute(attrsD, 'rdf:resource')
1398 attrsD = FeedParserDict()
1399 attrsD['rel']='license'
1400 if value: attrsD['href']=value
1401 context.setdefault('links', []).append(attrsD)
1403 def _start_creativecommons_license(self, attrsD):
1404 self.push('license', 1)
1405 _start_creativeCommons_license = _start_creativecommons_license
1407 def _end_creativecommons_license(self):
1408 value = self.pop('license')
1409 context = self._getContext()
1410 attrsD = FeedParserDict()
1411 attrsD['rel']='license'
1412 if value: attrsD['href']=value
1413 context.setdefault('links', []).append(attrsD)
1414 del context['license']
1415 _end_creativeCommons_license = _end_creativecommons_license
1417 def _addXFN(self, relationships, href, name):
1418 context = self._getContext()
1419 xfn = context.setdefault('xfn', [])
1420 value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
1421 if value not in xfn:
1424 def _addTag(self, term, scheme, label):
1425 context = self._getContext()
1426 tags = context.setdefault('tags', [])
1427 if (not term) and (not scheme) and (not label): return
1428 value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1429 if value not in tags:
1432 def _start_category(self, attrsD):
1433 if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1434 term = attrsD.get('term')
1435 scheme = attrsD.get('scheme', attrsD.get('domain'))
1436 label = attrsD.get('label')
1437 self._addTag(term, scheme, label)
1438 self.push('category', 1)
1439 _start_dc_subject = _start_category
1440 _start_keywords = _start_category
1442 def _start_media_category(self, attrsD):
1443 attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
1444 self._start_category(attrsD)
1446 def _end_itunes_keywords(self):
1447 for term in self.pop('itunes_keywords').split():
1448 self._addTag(term, 'http://www.itunes.com/', None)
1450 def _start_itunes_category(self, attrsD):
1451 self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1452 self.push('category', 1)
1454 def _end_category(self):
1455 value = self.pop('category')
1456 if not value: return
1457 context = self._getContext()
1458 tags = context['tags']
1459 if value and len(tags) and not tags[-1]['term']:
1460 tags[-1]['term'] = value
1462 self._addTag(value, None, None)
1463 _end_dc_subject = _end_category
1464 _end_keywords = _end_category
1465 _end_itunes_category = _end_category
1466 _end_media_category = _end_category
1468 def _start_cloud(self, attrsD):
1469 self._getContext()['cloud'] = FeedParserDict(attrsD)
1471 def _start_link(self, attrsD):
1472 attrsD.setdefault('rel', 'alternate')
1473 if attrsD['rel'] == 'self':
1474 attrsD.setdefault('type', 'application/atom+xml')
1476 attrsD.setdefault('type', 'text/html')
1477 context = self._getContext()
1478 attrsD = self._itsAnHrefDamnIt(attrsD)
1479 if attrsD.has_key('href'):
1480 attrsD['href'] = self.resolveURI(attrsD['href'])
1481 expectingText = self.infeed or self.inentry or self.insource
1482 context.setdefault('links', [])
1483 if not (self.inentry and self.inimage):
1484 context['links'].append(FeedParserDict(attrsD))
1485 if attrsD.has_key('href'):
1487 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1488 context['link'] = attrsD['href']
1490 self.push('link', expectingText)
1491 _start_producturl = _start_link
1493 def _end_link(self):
1494 value = self.pop('link')
1495 context = self._getContext()
1496 _end_producturl = _end_link
1498 def _start_guid(self, attrsD):
1499 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1502 def _end_guid(self):
1503 value = self.pop('id')
1504 self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1506 # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1507 # and only if the item doesn't already have a link element
1508 self._save('link', value)
1510 def _start_title(self, attrsD):
1511 if self.svgOK: return self.unknown_starttag('title', attrsD.items())
1512 self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1513 _start_dc_title = _start_title
1514 _start_media_title = _start_title
1516 def _end_title(self):
1517 if self.svgOK: return
1518 value = self.popContent('title')
1519 if not value: return
1520 context = self._getContext()
1522 _end_dc_title = _end_title
1524 def _end_media_title(self):
1525 hasTitle = self.hasTitle
1527 self.hasTitle = hasTitle
1529 def _start_description(self, attrsD):
1530 context = self._getContext()
1531 if context.has_key('summary'):
1532 self._summaryKey = 'content'
1533 self._start_content(attrsD)
1535 self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1536 _start_dc_description = _start_description
1538 def _start_abstract(self, attrsD):
1539 self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1541 def _end_description(self):
1542 if self._summaryKey == 'content':
1545 value = self.popContent('description')
1546 self._summaryKey = None
1547 _end_abstract = _end_description
1548 _end_dc_description = _end_description
1550 def _start_info(self, attrsD):
1551 self.pushContent('info', attrsD, 'text/plain', 1)
1552 _start_feedburner_browserfriendly = _start_info
1554 def _end_info(self):
1555 self.popContent('info')
1556 _end_feedburner_browserfriendly = _end_info
1558 def _start_generator(self, attrsD):
1560 attrsD = self._itsAnHrefDamnIt(attrsD)
1561 if attrsD.has_key('href'):
1562 attrsD['href'] = self.resolveURI(attrsD['href'])
1563 self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1564 self.push('generator', 1)
1566 def _end_generator(self):
1567 value = self.pop('generator')
1568 context = self._getContext()
1569 if context.has_key('generator_detail'):
1570 context['generator_detail']['name'] = value
1572 def _start_admin_generatoragent(self, attrsD):
1573 self.push('generator', 1)
1574 value = self._getAttribute(attrsD, 'rdf:resource')
1576 self.elementstack[-1][2].append(value)
1577 self.pop('generator')
1578 self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1580 def _start_admin_errorreportsto(self, attrsD):
1581 self.push('errorreportsto', 1)
1582 value = self._getAttribute(attrsD, 'rdf:resource')
1584 self.elementstack[-1][2].append(value)
1585 self.pop('errorreportsto')
1587 def _start_summary(self, attrsD):
1588 context = self._getContext()
1589 if context.has_key('summary'):
1590 self._summaryKey = 'content'
1591 self._start_content(attrsD)
1593 self._summaryKey = 'summary'
1594 self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1595 _start_itunes_summary = _start_summary
1597 def _end_summary(self):
1598 if self._summaryKey == 'content':
1601 self.popContent(self._summaryKey or 'summary')
1602 self._summaryKey = None
1603 _end_itunes_summary = _end_summary
1605 def _start_enclosure(self, attrsD):
1606 attrsD = self._itsAnHrefDamnIt(attrsD)
1607 context = self._getContext()
1608 attrsD['rel']='enclosure'
1609 context.setdefault('links', []).append(FeedParserDict(attrsD))
1611 def _start_source(self, attrsD):
1613 # This means that we're processing a source element from an RSS 2.0 feed
1614 self.sourcedata['href'] = attrsD[u'url']
1615 self.push('source', 1)
1619 def _end_source(self):
1621 value = self.pop('source')
1623 self.sourcedata['title'] = value
1624 self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1625 self.sourcedata.clear()
1627 def _start_content(self, attrsD):
1628 self.pushContent('content', attrsD, 'text/plain', 1)
1629 src = attrsD.get('src')
1631 self.contentparams['src'] = src
1632 self.push('content', 1)
1634 def _start_prodlink(self, attrsD):
1635 self.pushContent('content', attrsD, 'text/html', 1)
1637 def _start_body(self, attrsD):
1638 self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1639 _start_xhtml_body = _start_body
1641 def _start_content_encoded(self, attrsD):
1642 self.pushContent('content', attrsD, 'text/html', 1)
1643 _start_fullitem = _start_content_encoded
1645 def _end_content(self):
1646 copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1647 value = self.popContent('content')
1649 self._save('summary', value)
1651 _end_body = _end_content
1652 _end_xhtml_body = _end_content
1653 _end_content_encoded = _end_content
1654 _end_fullitem = _end_content
1655 _end_prodlink = _end_content
1657 def _start_itunes_image(self, attrsD):
1658 self.push('itunes_image', 0)
1659 if attrsD.get('href'):
1660 self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1661 _start_itunes_link = _start_itunes_image
1663 def _end_itunes_block(self):
1664 value = self.pop('itunes_block', 0)
1665 self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1667 def _end_itunes_explicit(self):
1668 value = self.pop('itunes_explicit', 0)
1669 # Convert 'yes' -> True, 'clean' to False, and any other value to None
1670 # False and None both evaluate as False, so the difference can be ignored
1671 # by applications that only need to know if the content is explicit.
1672 self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1674 def _start_media_content(self, attrsD):
1675 context = self._getContext()
1676 context.setdefault('media_content', [])
1677 context['media_content'].append(attrsD)
1679 def _start_media_thumbnail(self, attrsD):
1680 context = self._getContext()
1681 context.setdefault('media_thumbnail', [])
1682 self.push('url', 1) # new
1683 context['media_thumbnail'].append(attrsD)
1685 def _end_media_thumbnail(self):
1686 url = self.pop('url')
1687 context = self._getContext()
1688 if url != None and len(url.strip()) != 0:
1689 if not context['media_thumbnail'][-1].has_key('url'):
1690 context['media_thumbnail'][-1]['url'] = url
1692 def _start_media_player(self, attrsD):
1693 self.push('media_player', 0)
1694 self._getContext()['media_player'] = FeedParserDict(attrsD)
1696 def _end_media_player(self):
1697 value = self.pop('media_player')
1698 context = self._getContext()
1699 context['media_player']['content'] = value
1701 def _start_newlocation(self, attrsD):
1702 self.push('newlocation', 1)
1704 def _end_newlocation(self):
1705 url = self.pop('newlocation')
1706 context = self._getContext()
1707 # don't set newlocation if the context isn't right
1708 if context is not self.feeddata:
1710 context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    """SAX handler for well-formed feeds; on error the caller retries with the loose parser."""
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0    # NOTE(review): init lines restored from corrupted listing
        self.exc = None
        self.decls = {}

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)
        if uri == 'http://www.w3.org/1999/xlink':
            # xlink declarations must be re-emitted onto sanitized elements
            self.decls['xmlns:'+prefix] = uri

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None  # NOTE(review): branch restored
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD, self.decls = self.decls, {}
        if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
            attrsD['xmlns']=namespace
        if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
            attrsD['xmlns']=namespace

        if prefix:  # NOTE(review): guard restored
            localname = prefix.lower() + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break  # NOTE(review): restored
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:  # NOTE(review): guard restored
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''  # NOTE(review): branch restored
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:  # NOTE(review): guard restored
            localname = prefix + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break  # NOTE(review): restored
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # record the problem but keep parsing; bozo flags ill-formedness
        self.bozo = 1  # NOTE(review): body restored from corrupted listing
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)  # NOTE(review): body restored from corrupted listing
        raise exc
1804 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1805 special = re.compile('''[<>'"]''')
1806 bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
1807 elements_no_end_tag = [
1808 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
1809 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
1810 'source', 'track', 'wbr'
1813 def __init__(self, encoding, _type):
1814 self.encoding = encoding
1816 if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1817 sgmllib.SGMLParser.__init__(self)
1821 sgmllib.SGMLParser.reset(self)
1823 def _shorttag_replace(self, match):
1824 tag = match.group(1)
1825 if tag in self.elements_no_end_tag:
1826 return '<' + tag + ' />'
1828 return '<' + tag + '></' + tag + '>'
1830 def parse_starttag(self,i):
1831 j=sgmllib.SGMLParser.parse_starttag(self, i)
1832 if self._type == 'application/xhtml+xml':
1833 if j>2 and self.rawdata[j-2:j]=='/>':
1834 self.unknown_endtag(self.lasttag)
1837 def feed(self, data):
1838 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
1839 #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1840 data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
1841 data = data.replace(''', "'")
1842 data = data.replace('"', '"')
1847 self.encoding = self.encoding + '_INVALID_PYTHON_3'
1849 if self.encoding and type(data) == type(u''):
1850 data = data.encode(self.encoding)
1851 sgmllib.SGMLParser.feed(self, data)
1852 sgmllib.SGMLParser.close(self)
1854 def normalize_attrs(self, attrs):
1855 if not attrs: return attrs
1856 # utility method to be called by descendants
1857 attrs = dict([(k.lower(), v) for k, v in attrs]).items()
1858 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1862 def unknown_starttag(self, tag, attrs):
1863 # called for each start tag
1864 # attrs is a list of (attr, value) tuples
1865 # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1866 if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1870 for key, value in attrs:
1871 value=value.replace('>','>').replace('<','<').replace('"','"')
1872 value = self.bare_ampersand.sub("&", value)
1873 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1874 if type(value) != type(u''):
1876 value = unicode(value, self.encoding)
1878 value = unicode(value, 'iso-8859-1')
1880 # Currently, in Python 3 the key is already a str, and cannot be decoded again
1881 uattrs.append((unicode(key, self.encoding), value))
1883 uattrs.append((key, value))
1884 strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
1887 strattrs=strattrs.encode(self.encoding)
1890 if tag in self.elements_no_end_tag:
1891 self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1893 self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1895 def unknown_endtag(self, tag):
1896 # called for each end tag, e.g. for </pre>, tag will be 'pre'
1897 # Reconstruct the original end tag.
1898 if tag not in self.elements_no_end_tag:
1899 self.pieces.append("</%(tag)s>" % locals())
1901 def handle_charref(self, ref):
1902 # called for each character reference, e.g. for ' ', ref will be '160'
1903 # Reconstruct the original character reference.
1904 if ref.startswith('x'):
1905 value = unichr(int(ref[1:],16))
1907 value = unichr(int(ref))
1909 if value in _cp1252.keys():
1910 self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
1912 self.pieces.append('&#%(ref)s;' % locals())
1914 def handle_entityref(self, ref):
1915 # called for each entity reference, e.g. for '©', ref will be 'copy'
1916 # Reconstruct the original entity reference.
1917 if name2codepoint.has_key(ref):
1918 self.pieces.append('&%(ref)s;' % locals())
1920 self.pieces.append('&%(ref)s' % locals())
1922 def handle_data(self, text):
1923 # called for each block of plain text, i.e. outside of any tag and
1924 # not containing any character or entity references
1925 # Store the original text verbatim.
1926 if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
1927 self.pieces.append(text)
1929 def handle_comment(self, text):
1930 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1931 # Reconstruct the original comment.
1932 self.pieces.append('<!--%(text)s-->' % locals())
1934 def handle_pi(self, text):
1935 # called for each processing instruction, e.g. <?instruction>
1936 # Reconstruct original processing instruction.
1937 self.pieces.append('<?%(text)s>' % locals())
1939 def handle_decl(self, text):
1940 # called for the DOCTYPE, if present, e.g.
1941 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1942 # "http://www.w3.org/TR/html4/loose.dtd">
1943 # Reconstruct original DOCTYPE
1944 self.pieces.append('<!%(text)s>' % locals())
1946 _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1947 def _scan_name(self, i, declstartpos):
1948 rawdata = self.rawdata
1952 m = self._new_declname_match(rawdata, i)
1956 if (i + len(s)) == n:
1957 return None, -1 # end of buffer
1958 return name.lower(), m.end()
1960 self.handle_data(rawdata)
1961 # self.updatepos(declstartpos, i)
def convert_charref(self, name):
    """Serialize a numeric character reference as '&#NAME;'."""
    ref = '&#%s;' % name
    return ref
def convert_entityref(self, name):
    """Serialize a named entity reference as '&NAME;'."""
    ref = '&%s;' % name
    return ref
def output(self):
    '''Return processed HTML as a single string'''
    # str() each piece: handlers may have stored non-string data.
    return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        # Normalize numeric character references for the XML special
        # characters to their named-entity forms so they survive the
        # loose (sgmllib-based) parse.
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        # For non-XML content types, fully decode the named entities.
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data

    def strattrs(self, attrs):
        # Serialize (name, value) attribute pairs, escaping embedded
        # double quotes so the values can be re-quoted safely.
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])
class _MicroformatsParser:
    # NOTE(review): the numeric property-type constants used throughout this
    # class (self.NODE, self.EMAIL, self.STRING, self.DATE, self.URI; cf. the
    # iPropertyType=4 default in getPropertyValue) are not visible in this
    # copy of the file -- confirm their class-level definitions upstream.

    # rel values recognized as XFN relationships by findXFN.
    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    # File extensions treated as downloadable binary/media content by
    # isProbablyDownloadable / findEnclosures.
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
def __init__(self, data, baseuri, encoding):
    """Parse an HTML document and prepare the microformat accumulators."""
    self.document = BeautifulSoup.BeautifulSoup(data)
    self.baseuri = baseuri
    self.encoding = encoding
    if type(data) == type(u''):
        data = data.encode(encoding)
    # Result accumulators, filled in by findTags / findEnclosures /
    # findXFN / findVCards (they are appended to and read elsewhere in
    # this class and by _parseMicroformats).
    self.tags = []
    self.enclosures = []
    self.xfn = []
    self.vcard = None
def vcardEscape(self, s):
    """Backslash-escape ',', ';' and newlines in vCard text values
    (RFC 2426 section 2.4.2).  Non-string values pass through unchanged."""
    if type(s) in (type(''), type(u'')):
        s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
    return s
def vcardFold(self, s):
    """Fold a vCard content line per RFC 2426 line folding.

    Trailing ';' separators are stripped.  The first physical line is
    limited to 75 characters; continuation lines start with one space
    and carry 74 characters each.
    """
    s = re.sub(';+$', '', s)
    sFolded = ''
    iMax = 75
    sPrefix = ''
    while len(s) > iMax:
        sFolded += sPrefix + s[:iMax] + '\n'
        s = s[iMax:]
        sPrefix = ' '
        iMax = 74
    sFolded += sPrefix + s
    return sFolded
def normalize(self, s):
    """Collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()
def unique(self, aList):
    """Return aList without duplicates, preserving first-seen order."""
    results = []
    for element in aList:
        if element not in results:
            results.append(element)
    return results
def toISO8601(self, dt):
    """Format a UTC time tuple as an ISO 8601 timestamp with a 'Z' suffix."""
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    return time.strftime(fmt, dt)
def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
    # Extract a microformat property value from the tree under elmRoot.
    # iPropertyType chooses the interpretation (the class property-type
    # constants NODE/EMAIL/STRING/DATE/URI); bAllowMultiple returns a
    # list; bAutoEscape applies vCard escaping to string results.
    # NOTE(review): several statements are missing from this copy of the
    # method (initializations of snapResults/arFilter/arResults/arValues
    # and a number of guard lines such as 'if not bFound:'); the surviving
    # code is preserved as-is -- compare with upstream feedparser before
    # relying on it.
    sProperty = sProperty.lower()
    # match elements whose class attribute mentions the property name
    propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
    if bAllowMultiple and (iPropertyType != self.NODE):
        # multi-valued properties may be spread across <ul>/<ol> list items
        containers = elmRoot(['ul', 'ol'], propertyMatch)
        for container in containers:
            snapResults.extend(container('li'))
        bFound = (len(snapResults) != 0)
        snapResults = elmRoot(all, propertyMatch)
        bFound = (len(snapResults) != 0)
    if (not bFound) and (sProperty == 'value'):
        # 'value' may fall back to a <pre> block taken verbatim
        snapResults = elmRoot('pre')
        bFound = (len(snapResults) != 0)
        bNormalize = not bFound
        snapResults = [elmRoot]
        bFound = (len(snapResults) != 0)
    if sProperty == 'vcard':
        # exclude nodes that belong to a nested vcard
        snapFilter = elmRoot(all, propertyMatch)
        for node in snapFilter:
            if node.findParent(all, propertyMatch):
                arFilter.append(node)
    for node in snapResults:
        if node not in arFilter:
            arResults.append(node)
    bFound = (len(arResults) != 0)
    # type-appropriate empty result when nothing matched
    if bAllowMultiple: return []
    elif iPropertyType == self.STRING: return ''
    elif iPropertyType == self.DATE: return None
    elif iPropertyType == self.URI: return ''
    elif iPropertyType == self.NODE: return None
    for elmResult in arResults:
        if iPropertyType == self.NODE:
            arValues.append(elmResult)
        sNodeName = elmResult.name.lower()
        if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
            # mailto: links carry the address in href; drop any ?subject=
            sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            sValue = bNormalize and self.normalize(sValue) or sValue.strip()
        if (not sValue) and (sNodeName == 'abbr'):
            # <abbr> carries the machine-readable value in its title
            sValue = elmResult.get('title')
            sValue = bNormalize and self.normalize(sValue) or sValue.strip()
        if (not sValue) and (iPropertyType == self.URI):
            if sNodeName == 'a': sValue = elmResult.get('href')
            elif sNodeName == 'img': sValue = elmResult.get('src')
            elif sNodeName == 'object': sValue = elmResult.get('data')
            sValue = bNormalize and self.normalize(sValue) or sValue.strip()
        if (not sValue) and (sNodeName == 'img'):
            sValue = elmResult.get('alt')
            sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            # last resort: rendered contents with markup stripped
            sValue = elmResult.renderContents()
            sValue = re.sub(r'<\S[^>]*>', '', sValue)
            sValue = sValue.replace('\r\n', '\n')
            sValue = sValue.replace('\r', '\n')
            sValue = bNormalize and self.normalize(sValue) or sValue.strip()
        if not sValue: continue
        if iPropertyType == self.DATE:
            sValue = _parse_date_iso8601(sValue)
        arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
    return bAutoEscape and self.vcardEscape(sValue) or sValue
def findVCards(self, elmRoot, bAgentParsing=0):
    # Serialize every hCard under elmRoot as vCard 3.0 text.  When
    # bAgentParsing is true, elmRoot itself is treated as the card (used
    # for nested AGENT cards).
    # NOTE(review): many statements are missing from this copy of the
    # method (accumulator initializations such as sVCards/arLines and
    # numerous 'if'/'else' guard lines); the surviving code is preserved
    # as-is -- compare with upstream feedparser before relying on it.
    if not bAgentParsing:
        arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)

    for elmCard in arCards:

        def processSingleString(sProperty):
            # emit 'PROP:value' for a single string-valued property
            sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
            arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
            return sValue or u''

        def processSingleURI(sProperty):
            # emit a URI-valued property; inline data: URIs become
            # base64-encoded values with a TYPE parameter
            sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
            if sValue.startswith('data:'):
                sEncoding = ';ENCODING=b'
                sContentType = sValue.split(';')[0].split('/').pop()
                sValue = sValue.split(',', 1).pop()
                elmValue = self.getPropertyValue(elmCard, sProperty)
                if sProperty != 'url':
                    sValueKey = ';VALUE=uri'
                sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
            sContentType = sContentType.upper()
            if sContentType == 'OCTET-STREAM':
                sContentType = ';TYPE=' + sContentType.upper()
            arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

        def processTypeValue(sProperty, arDefaultType, arForceType=None):
            # emit 'PROP;TYPE=...:value' lines for each matching element
            arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
            for elmResult in arResults:
                arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                arType = self.unique(arForceType + arType)
                arType = arDefaultType
                sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

        # AGENT
        # must do this before all other properties because it is destructive
        # (removes nested class="vcard" nodes so they don't interfere with
        # this vcard's other properties)
        arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
        for elmAgent in arAgent:
            if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                sAgentValue = sAgentValue.replace('\n', '\\n')
                sAgentValue = sAgentValue.replace(';', '\\;')
                arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                # Completely remove the agent element from the parse tree
                sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
                arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))

        # FN (formatted name)
        sFN = processSingleString('fn')

        # N (structured name)
        elmName = self.getPropertyValue(elmCard, 'n')
        sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
        sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
        arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
        arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
        arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
        arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
                       ','.join(arAdditionalNames) + ';' +
                       ','.join(arHonorificPrefixes) + ';' +
                       ','.join(arHonorificSuffixes)))

        # implied "N" optimization
        # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
        arNames = self.normalize(sFN).split()
        if len(arNames) == 2:
            bFamilyNameFirst = (arNames[0].endswith(',') or
                len(arNames[1]) == 1 or
                ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
            if bFamilyNameFirst:
                arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
                arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))

        # SORT-STRING
        sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
        arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

        # NICKNAME
        arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
        arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

        # PHOTO
        processSingleURI('photo')

        # BDAY (birthday)
        dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
        arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

        # ADR (address)
        arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
        for elmAdr in arAdr:
            arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
            arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
            sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
            sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
            sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
            sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
            sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
            sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
            sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
            arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                           sPostOfficeBox + ';' +
                           sExtendedAddress + ';' +
                           sStreetAddress + ';' +

        # LABEL
        processTypeValue('label', ['intl','postal','parcel','work'])

        # TEL (phone number)
        processTypeValue('tel', ['voice'])

        # EMAIL
        processTypeValue('email', ['internet'], ['internet'])

        # MAILER
        processSingleString('mailer')

        # TZ (timezone)
        processSingleString('tz')

        # GEO (geographical information)
        elmGeo = self.getPropertyValue(elmCard, 'geo')
        sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
        sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
        arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

        # TITLE
        processSingleString('title')

        # ROLE
        processSingleString('role')

        # LOGO
        processSingleURI('logo')

        # ORG (organization)
        elmOrg = self.getPropertyValue(elmCard, 'org')
        sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
        if not sOrganizationName:
            # implied "organization-name" optimization
            # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
            sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
            if sOrganizationName:
                arLines.append(self.vcardFold('ORG:' + sOrganizationName))
            arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
            arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

        # CATEGORIES
        arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
        arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

        # NOTE
        processSingleString('note')

        # REV (revision timestamp)
        processSingleString('rev')

        # SOUND
        processSingleURI('sound')

        # UID
        processSingleString('uid')

        # URL
        processSingleURI('url')

        # CLASS
        processSingleString('class')

        # KEY
        processSingleURI('key')

        arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
        sVCards += u'\n'.join(arLines) + u'\n'

    return sVCards.strip()
def isProbablyDownloadable(self, elm):
    """Heuristic: does elm link to downloadable media/binary content?

    True when its type attribute is an audio/video/non-XML application
    media type, or when the href path ends in a known binary extension.
    """
    attrsD = elm.attrMap
    if 'href' not in attrsD: return 0
    linktype = attrsD.get('type', '').strip()
    if linktype.startswith('audio/') or \
       linktype.startswith('video/') or \
       (linktype.startswith('application/') and not linktype.endswith('xml')):
        return 1
    # No useful media type: fall back to the URL path's file extension.
    path = urlparse.urlparse(attrsD['href'])[2]
    if path.find('.') == -1: return 0
    fileext = path.split('.').pop().lower()
    return fileext in self.known_binary_extensions
def findTags(self):
    """Collect rel-tag microformats into self.tags.

    The tag term is the last non-empty path segment of the link target;
    the scheme is the remainder of the URL, normalized to end in '/'.
    """
    all = lambda x: 1
    for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
        href = elm.get('href')
        if not href: continue
        urlscheme, domain, path, params, query, fragment = \
            urlparse.urlparse(_urljoin(self.baseuri, href))
        segments = path.split('/')
        tag = segments.pop()
        if not tag:
            # trailing slash: the term is the preceding segment
            tag = segments.pop()
        tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
        if not tagscheme.endswith('/'):
            tagscheme += '/'
        self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
def findEnclosures(self):
    """Collect rel-enclosure links (and probable downloads) into
    self.enclosures, de-duplicated by attribute map."""
    all = lambda x: 1
    enclosure_match = re.compile(r'\benclosure\b')
    for elm in self.document(all, {'href': re.compile(r'.+')}):
        if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
        if elm.attrMap not in self.enclosures:
            self.enclosures.append(elm.attrMap)
            if elm.string and not elm.get('title'):
                # use the link text as the title when none is given
                self.enclosures[-1]['title'] = elm.string
def findXFN(self):
    """Collect XFN relationship links into self.xfn.

    Only rel values present in known_xfn_relationships are recorded.
    """
    all = lambda x: 1
    for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
        rels = elm.get('rel', '').split()
        xfn_rels = []
        for rel in rels:
            if rel in self.known_xfn_relationships:
                xfn_rels.append(rel)
        if xfn_rels:
            self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
def _parseMicroformats(htmlSource, baseURI, encoding):
    """Parse microformats (tags, enclosures, XFN, hCard) out of htmlSource.

    Returns a dict of the collected results, or None when BeautifulSoup is
    unavailable or the source cannot be parsed.
    """
    if not BeautifulSoup: return
    if _debug: sys.stderr.write('entering _parseMicroformats\n')
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
class _RelativeURIResolver(_BaseHTMLProcessor):
    # (tag, attribute) pairs whose values are URIs and must be resolved
    # against the document base URI.
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Join against the base URI, then vet the resulting scheme.
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        # Rewrite every URI-valued attribute of this tag to absolute form.
        if _debug:
            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Rewrite relative URIs in htmlSource as absolute, using baseURI."""
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()
def _makeSafeAbsoluteURI(base, rel=None):
    """Join rel to base and return the result only if its scheme is in
    ACCEPTABLE_URI_SCHEMES; otherwise return an empty string."""
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        # no relative part: vet the base URI itself
        if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
            return u''
        return base
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
class _HTMLSanitizer(_BaseHTMLProcessor):
    # Whitelist of HTML elements allowed through the sanitizer.
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
      'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
      'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
      'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
      'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
      'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
      'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
      'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
      'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
      'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
      'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
      'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    # Whitelist of attributes allowed on acceptable elements.
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
      'xml:lang']

    # Elements whose entire content is dropped (tracked with a depth counter).
    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow']

    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
      'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
      'arabic-form', 'ascent', 'attributeName', 'attributeType',
      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
      'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
      'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
      'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
      'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
      'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
      'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
      'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
      'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
      'min', 'name', 'offset', 'opacity', 'orient', 'origin',
      'overline-position', 'overline-thickness', 'panose-1', 'path',
      'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
      'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
      'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
      'stop-color', 'stop-opacity', 'strikethrough-position',
      'strikethrough-thickness', 'stroke', 'stroke-dasharray',
      'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
      'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
      'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
      'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
      'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
      'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
      'y2', 'zoomAndPan']

    # Lazily-built maps from lowercased SVG names back to their camelCase
    # forms; populated by unknown_starttag on the first SVG element.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity']
def reset(self):
    """Reset parser state, including sanitizer-specific counters."""
    _BaseHTMLProcessor.reset(self)
    # depth counter for content-dropping elements (script/style/applet)
    self.unacceptablestack = 0
    # nesting depth inside <math>/<svg> islands; nonzero enables the
    # corresponding vocabulary whitelists in unknown_starttag
    self.mathmlOK = 0
    self.svgOK = 0
def unknown_starttag(self, tag, attrs):
    """Emit the start tag only if it is whitelisted (HTML, or MathML/SVG
    inside a declared island), with its attributes filtered against the
    matching attribute whitelist and style values sanitized."""
    acceptable_attributes = self.acceptable_attributes
    keymap = {}
    if not tag in self.acceptable_elements or self.svgOK:
        if tag in self.unacceptable_elements_with_end_tag:
            self.unacceptablestack += 1

        # add implicit namespaces to html5 inline svg/mathml
        if self._type.endswith('html'):
            if not dict(attrs).get('xmlns'):
                if tag == 'svg':
                    attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                if tag == 'math':
                    attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

        # not otherwise acceptable, perhaps it is MathML or SVG?
        if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
            self.mathmlOK += 1
        if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
            self.svgOK += 1

        # chose acceptable attributes based on tag class, else bail
        if self.mathmlOK and tag in self.mathml_elements:
            acceptable_attributes = self.mathml_attributes
        elif self.svgOK and tag in self.svg_elements:
            # for most vocabularies, lowercasing is a good idea. Many
            # svg elements, however, are camel case
            if not self.svg_attr_map:
                lower=[attr.lower() for attr in self.svg_attributes]
                mix=[a for a in self.svg_attributes if a not in lower]
                self.svg_attributes = lower
                self.svg_attr_map = dict([(a.lower(),a) for a in mix])

                lower=[attr.lower() for attr in self.svg_elements]
                mix=[a for a in self.svg_elements if a not in lower]
                self.svg_elements = lower
                self.svg_elem_map = dict([(a.lower(),a) for a in mix])
            acceptable_attributes = self.svg_attributes
            tag = self.svg_elem_map.get(tag,tag)
            keymap = self.svg_attr_map
        elif not tag in self.acceptable_elements:
            return

    # declare xlink namespace, if needed
    if self.mathmlOK or self.svgOK:
        if [n for n, v in attrs if n.startswith('xlink:')]:
            if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

    clean_attrs = []
    for key, value in self.normalize_attrs(attrs):
        if key in acceptable_attributes:
            key = keymap.get(key,key)
            clean_attrs.append((key,value))
        elif key == 'style':
            # style survives only after CSS sanitization
            clean_value = self.sanitize_style(value)
            if clean_value: clean_attrs.append((key,clean_value))
    _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
def unknown_endtag(self, tag):
    """Emit the end tag only when whitelisted; maintain the drop-content
    and MathML/SVG nesting counters."""
    if not tag in self.acceptable_elements:
        if tag in self.unacceptable_elements_with_end_tag:
            self.unacceptablestack -= 1
        if self.mathmlOK and tag in self.mathml_elements:
            if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
        elif self.svgOK and tag in self.svg_elements:
            tag = self.svg_elem_map.get(tag,tag)
            if tag == 'svg' and self.svgOK: self.svgOK -= 1
        else:
            # neither acceptable HTML nor an active MathML/SVG island
            return
    _BaseHTMLProcessor.unknown_endtag(self, tag)
def handle_pi(self, text):
    # Processing instructions are dropped by the sanitizer (no body was
    # present in this copy; upstream feedparser's sanitizer discards them).
    pass

def handle_decl(self, text):
    # DOCTYPE and other declarations are likewise dropped.
    pass
def handle_data(self, text):
    """Emit character data unless we are inside a dropped element."""
    if self.unacceptablestack:
        return
    _BaseHTMLProcessor.handle_data(self, text)
def sanitize_style(self, style):
    """Return a sanitized copy of a CSS style attribute value.

    url(...) tokens are removed, the whole value must pass a character
    gauntlet, and only whitelisted properties (plus vetted keyword values
    for background/border/margin/padding shorthands) survive.
    """
    # disallow urls
    style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

    # gauntlet: reject any value containing unexpected characters
    if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
    # This replaced a regexp that used re.match and was prone to pathological back-tracking.
    if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''

    clean = []
    for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
        if not value: continue
        if prop.lower() in self.acceptable_css_properties:
            clean.append(prop + ': ' + value + ';')
        elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
            # shorthand properties: every keyword must be whitelisted
            for keyword in value.split():
                if not keyword in self.acceptable_css_keywords and \
                    not self.valid_css_values.match(keyword):
                    break
            else:
                clean.append(prop + ': ' + value + ';')
        elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
def _sanitizeHTML(htmlSource, encoding, _type):
    """Strip unacceptable elements/attributes from htmlSource and, when a
    Tidy interface is installed, pretty-print the result as XHTML."""
    p = _HTMLSanitizer(encoding, _type)
    # escape CDATA openers so sgmllib does not treat their content as markup
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                # interface not installed; try the next one
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # keep only the body content of the tidied document
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        # Any 3xx other than 304 is treated as a redirect; other codes are
        # returned with the status recorded on the response object.
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        # Follow the redirect only when a Location header is present.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        # Same handling as 302, but preserving the permanent-redirect code.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            # deliberate best-effort: any failure falls back to default handling
            return self.http_error_default(req, fp, code, msg, headers)
# Open a URL, local file, file-like object, or literal string as a readable
# stream. Tries, in order: an already-open file-like object, stdin ('-'),
# a URL via urllib2 (with etag/modified/agent/referrer/auth headers), a
# local filename, and finally wraps the string itself in a StringIO.
# NOTE(review): many numbering gaps here (e.g. 2852 -> 2854, 2888 -> 2890,
# 2894 -> 2896) — the try/except blocks around the urllib2 open and the
# native file open, plus several guards, are missing from this excerpt.
2819 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2820 """URL, filename, or string --> stream
2822 This function lets you define parsers that take any input source
2823 (URL, pathname to local or network file, or actual data as a string)
2824 and deal with it in a uniform manner. Returned object is guaranteed
2825 to have all the basic stdio read methods (read, readline, readlines).
2826 Just .close() the object when you're done with it.
2828 If the etag argument is supplied, it will be used as the value of an
2829 If-None-Match request header.
2831 If the modified argument is supplied, it can be a tuple of 9 integers
2832 (as returned by gmtime() in the standard Python time module) or a date
2833 string in any format supported by feedparser. Regardless, it MUST
2834 be in GMT (Greenwich Mean Time). It will be reformatted into an
2835 RFC 1123-compliant date and used as the value of an If-Modified-Since
2838 If the agent argument is supplied, it will be used as the value of a
2839 User-Agent request header.
2841 If the referrer argument is supplied, it will be used as the value of a
2842 Referer[sic] request header.
2844 If handlers is supplied, it is a list of handlers used to build a
2847 if request_headers is supplied it is a dictionary of HTTP request headers
2848 that will override the values generated by FeedParser.
# Already a file-like object: hand it back unchanged.
2851 if hasattr(url_file_stream_or_string, 'read'):
2852 return url_file_stream_or_string
2854 if url_file_stream_or_string == '-':
2857 if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2858 # Deal with the feed URI scheme
2859 if url_file_stream_or_string.startswith('feed:http'):
2860 url_file_stream_or_string = url_file_stream_or_string[5:]
2861 elif url_file_stream_or_string.startswith('feed:'):
2862 url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2865 # test for inline user:password for basic auth
2868 urltype, rest = urllib.splittype(url_file_stream_or_string)
2869 realhost, rest = urllib.splithost(rest)
2871 user_passwd, realhost = urllib.splituser(realhost)
# Rebuild the URL without the credentials; they travel in the
# Authorization header instead (base64-encoded below).
2873 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2874 auth = base64.standard_b64encode(user_passwd).strip()
# IDNA-encode non-ASCII hostnames. NOTE(review): encoding the whole
# URL (not just the host) with 'idna' looks suspect — confirm upstream.
2878 if isinstance(url_file_stream_or_string,unicode):
2879 url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8')
2881 url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8')
2885 # try to open with urllib2 (to use optional headers)
2886 request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2887 opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
2888 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2890 return opener.open(request)
2892 opener.close() # JohnD
2894 # try to open with native open function (if url_file_stream_or_string is a filename)
2896 return open(url_file_stream_or_string, 'rb')
2900 # treat url_file_stream_or_string as string
2901 return _StringIO(str(url_file_stream_or_string))
# Build a urllib2.Request carrying the conditional-GET (ETag / Last-Modified),
# identification (User-Agent / Referer), compression (Accept-encoding),
# auth (Authorization), and caller-supplied request headers.
# NOTE(review): numbering gaps (e.g. 2905 -> 2907, 2919 -> 2921) indicate the
# 'if etag:', 'if modified:', 'if referrer:', gzip/zlib feature guards, and
# the closing 'return request' are missing from this excerpt.
2903 def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
2904 request = urllib2.Request(url)
2905 request.add_header('User-Agent', agent)
2907 request.add_header('If-None-Match', etag)
# 'modified' may arrive as a string or datetime; normalize to a time tuple.
2908 if type(modified) == type(''):
2909 modified = _parse_date(modified)
2910 elif isinstance(modified, datetime.datetime):
2911 modified = modified.utctimetuple()
2913 # format into an RFC 1123-compliant timestamp. We can't use
2914 # time.strftime() since the %a and %b directives can be affected
2915 # by the current locale, but RFC 2616 states that dates must be
2917 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2918 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2919 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
2921 request.add_header('Referer', referrer)
# The four Accept-encoding variants below presumably sit under feature
# tests for gzip/zlib availability — confirm against upstream source.
2923 request.add_header('Accept-encoding', 'gzip, deflate')
2925 request.add_header('Accept-encoding', 'gzip')
2927 request.add_header('Accept-encoding', 'deflate')
2929 request.add_header('Accept-encoding', '')
2931 request.add_header('Authorization', 'Basic %s' % auth)
2933 request.add_header('Accept', ACCEPT_HEADER)
2934 # use this for whatever -- cookies, special headers, etc
2935 # [('Cookie','Something'),('x-special-header','Another Value')]
2936 for header_name, header_value in request_headers.items():
2937 request.add_header(header_name, header_value)
2938 request.add_header('A-IM', 'feed') # RFC 3229 support
def registerDateHandler(func):
    '''Add *func* to the front of the date-handler chain.

    *func* takes a date string and returns a 9-tuple date in GMT (or a
    false value when it cannot parse the string).
    '''
    # Prepend, so handlers registered later are consulted first by _parse_date.
    _date_handlers[:0] = [func]
2946 # ISO-8601 date parsing routines written by Fazal Majid.
2947 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2948 # parser is beyond the scope of feedparser and would be a worthwhile addition
2949 # to the Python library.
2950 # A single regular expression cannot parse ISO 8601 date formats into groups
2951 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2952 # 0301-04-01), so we use templates instead.
2953 # Please note the order in templates is significant because we need a
# Templates use placeholder tokens (YYYY, MM, DD, OOO, CC) that are expanded
# into named regex groups below.
2955 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2956 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2957 '-YY-?MM', '-OOO', '-YY',
# NOTE(review): the list is unterminated above and the assignment header for
# the regex list (presumably '_iso8601_re = [tmpl.replace(...' ) is missing —
# numbering jumps 2957 -> 2963. The .replace() chain rewrites each template
# token into a named group, then appends an optional time + timezone suffix.
2963 'YYYY', r'(?P<year>\d{4})').replace(
2964 'YY', r'(?P<year>\d\d)').replace(
2965 'MM', r'(?P<month>[01]\d)').replace(
2966 'DD', r'(?P<day>[0123]\d)').replace(
2967 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2968 'CC', r'(?P<century>\d\d$)')
2969 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2970 + r'(:(?P<second>\d{2}))?'
2971 + r'(\.(?P<fracsecond>\d+))?'
2972 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2973 for tmpl in _iso8601_tmpl]
# Pre-bind the compiled .match methods; _parse_date_iso8601 iterates these.
2978 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
# Parse ISO-8601-style date strings by trying each precompiled template
# pattern in turn, then normalizing the matched fields (2-digit years,
# ordinal dates, missing month/day, timezone offsets) into a 9-tuple.
# NOTE(review): numbering gaps (2984 -> 2986, 2987 -> 2990, etc.) — the
# loop-break on first match, several try/except guards, and some branch
# lines are missing from this excerpt.
2983 def _parse_date_iso8601(dateString):
2984 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2986 for _iso8601_match in _iso8601_matches:
2987 m = _iso8601_match(dateString)
# An empty match means the pattern matched zero characters: treat as no match.
2990 if m.span() == (0, 0): return
2991 params = m.groupdict()
2992 ordinal = params.get('ordinal', 0)
2994 ordinal = int(ordinal)
2997 year = params.get('year', '--')
2998 if not year or year == '--':
2999 year = time.gmtime()[0]
3000 elif len(year) == 2:
3001 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
3002 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3005 month = params.get('month', '-')
3006 if not month or month == '-':
3007 # ordinals are NOT normalized by mktime, we simulate them
3008 # by setting month=1, day=ordinal
3012 month = time.gmtime()[1]
3014 day = params.get('day', 0)
3019 elif params.get('century', 0) or \
3020 params.get('year', 0) or params.get('month', 0):
3023 day = time.gmtime()[2]
3026 # special case of the century - is the first year of the 21st century
3027 # 2000 or 2001 ? The debate goes on...
3028 if 'century' in params.keys():
3029 year = (int(params['century']) - 1) * 100 + 1
3030 # in ISO 8601 most fields are optional
3031 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
3032 if not params.get(field, None):
3034 hour = int(params.get('hour', 0))
3035 minute = int(params.get('minute', 0))
3036 second = int(float(params.get('second', 0)))
3037 # weekday is normalized by mktime(), we can ignore it
3039 daylight_savings_flag = -1
3040 tm = [year, month, day, hour, minute, second, weekday,
3041 ordinal, daylight_savings_flag]
3042 # ISO 8601 time zone adjustments
3043 tz = params.get('tz')
3044 if tz and tz != 'Z':
# NOTE(review): both branches below apply the offset; the sign decision
# ('-' vs '+' prefix handling) is on lines missing from this excerpt.
3046 tm[3] += int(params.get('tzhour', 0))
3047 tm[4] += int(params.get('tzmin', 0))
3049 tm[3] -= int(params.get('tzhour', 0))
3050 tm[4] -= int(params.get('tzmin', 0))
3053 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
3054 # which is guaranteed to normalize d/m/y/h/m/s.
3055 # Many implementations have bugs, but we'll pretend they don't.
3056 return time.localtime(time.mktime(tuple(tm)))
3057 registerDateHandler(_parse_date_iso8601)
3059 # 8-bit date handling routines written by ytrewq1.
# Korean date-component markers as Unicode escapes (originally euc-kr bytes).
3060 _korean_year = u'\ub144' # b3e2 in euc-kr
3061 _korean_month = u'\uc6d4' # bff9 in euc-kr
3062 _korean_day = u'\uc77c' # c0cf in euc-kr
3063 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
3064 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
# OnBlog: 'YYYY<year> MM<month> DD<day> hh:mm:ss'.
3066 _korean_onblog_date_re = \
3067 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
3068 (_korean_year, _korean_month, _korean_day))
# Nate: 'YYYY-MM-DD <am|pm> h:m:s' (the {,2} quantifiers allow 1-2 digits).
3069 _korean_nate_date_re = \
3070 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
3071 (_korean_am, _korean_pm))
# Convert an OnBlog (Korean) date to W3DTF with a fixed +09:00 (KST) offset,
# then delegate to _parse_date_w3dtf.
# NOTE(review): numbering jumps 3074 -> 3076 — the usual 'if not m: return'
# guard after the match appears to be missing from this excerpt.
3072 def _parse_date_onblog(dateString):
3073 '''Parse a string according to the OnBlog 8-bit date format'''
3074 m = _korean_onblog_date_re.match(dateString)
3076 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3077 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3078 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3079 'zonediff': '+09:00'}
3080 if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
3081 return _parse_date_w3dtf(w3dtfdate)
3082 registerDateHandler(_parse_date_onblog)
# Convert a Nate (Korean) date to W3DTF at +09:00 and delegate to
# _parse_date_w3dtf. The am/pm marker (group 4) adjusts the 12-hour clock.
# NOTE(review): numbering gaps (3086 -> 3088, 3090 -> 3095) — the no-match
# guard, the 'ampm' extraction, the +12 hour adjustment, and zero-padding
# lines are missing from this excerpt.
3084 def _parse_date_nate(dateString):
3085 '''Parse a string according to the Nate 8-bit date format'''
3086 m = _korean_nate_date_re.match(dateString)
3088 hour = int(m.group(5))
3090 if (ampm == _korean_pm):
3095 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3096 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3097 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
3098 'zonediff': '+09:00'}
3099 if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
3100 return _parse_date_w3dtf(w3dtfdate)
3101 registerDateHandler(_parse_date_nate)
# MS SQL style 'YYYY-MM-DD hh:mm:ss[.fff]' dates, converted to W3DTF at
# +09:00 and delegated to _parse_date_w3dtf.
# NOTE(review): the assignment header '_mssql_date_re = \' is missing above
# the re.compile (numbering jumps 3101 -> 3104), as is the no-match guard
# after .match() (3107 -> 3109).
3104 re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
3105 def _parse_date_mssql(dateString):
3106 '''Parse a string according to the MS SQL date format'''
3107 m = _mssql_date_re.match(dateString)
3109 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3110 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3111 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3112 'zonediff': '+09:00'}
3113 if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
3114 return _parse_date_w3dtf(w3dtfdate)
3115 registerDateHandler(_parse_date_mssql)
3117 # Unicode strings for Greek date strings
# Maps Greek month abbreviations (several spelling variants) to English ones.
# NOTE(review): the dict headers ('_greek_months = { \' and '_greek_wdays =
# { \') and their closing braces are missing — numbering jumps 3117 -> 3120
# and 3138 -> 3143.
3120 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
3121 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
3122 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
3123 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
3124 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
3125 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
3126 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
3127 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
3128 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
3129 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
3130 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
3131 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
3132 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
3133 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
3134 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
3135 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
3136 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
3137 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
3138 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
# Greek weekday abbreviations mapped to English ones.
3143 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
3144 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
3145 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
3146 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
3147 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
3148 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
3149 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
# RFC822-shaped date with Greek weekday/month names: 'Wday, DD Mon YYYY
# hh:mm:ss TZ'.
3152 _greek_date_format_re = \
3153 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
# Translate Greek weekday/month names to English, rebuild an RFC822 date
# string, and delegate to _parse_date_rfc822.
# NOTE(review): numbering gaps (3157 -> 3160, 3161 -> 3164) — the no-match
# guard and the try/except around the dictionary lookups (which returns
# when the name is unknown) are missing from this excerpt.
3155 def _parse_date_greek(dateString):
3156 '''Parse a string according to a Greek 8-bit date format.'''
3157 m = _greek_date_format_re.match(dateString)
3160 wday = _greek_wdays[m.group(1)]
3161 month = _greek_months[m.group(3)]
3164 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
3165 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
3166 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
3167 'zonediff': m.group(8)}
3168 if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
3169 return _parse_date_rfc822(rfc822date)
3170 registerDateHandler(_parse_date_greek)
3172 # Unicode strings for Hungarian date strings
# Maps Hungarian month names to zero-padded month numbers.
# NOTE(review): the entries for November/December and the closing brace are
# missing (numbering jumps 3184 -> 3189). 'febru\u00e1ri' and 'm\u00e1ujus'
# look like typos for the actual month names — confirm against upstream.
3173 _hungarian_months = \
3175 u'janu\u00e1r': u'01', # e1 in iso-8859-2
3176 u'febru\u00e1ri': u'02', # e1 in iso-8859-2
3177 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
3178 u'\u00e1prilis': u'04', # e1 in iso-8859-2
3179 u'm\u00e1ujus': u'05', # e1 in iso-8859-2
3180 u'j\u00fanius': u'06', # fa in iso-8859-2
3181 u'j\u00falius': u'07', # fa in iso-8859-2
3182 u'augusztus': u'08',
3183 u'szeptember': u'09',
3184 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
# 'YYYY-<monthname>-D(T)h:mm+hh:mm' with a named Hungarian month.
3189 _hungarian_date_format_re = \
3190 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
# Translate the Hungarian month name to a number, rebuild a W3DTF string,
# and delegate to _parse_date_w3dtf.
# NOTE(review): numbering gaps (3194 -> 3197, 3197 -> 3206) — the no-match
# guard, the month-lookup try/except, and the day/hour zero-padding lines
# (which define the 'day' and 'hour' names used below) are missing here.
3192 def _parse_date_hungarian(dateString):
3193 '''Parse a string according to a Hungarian 8-bit date format.'''
3194 m = _hungarian_date_format_re.match(dateString)
3197 month = _hungarian_months[m.group(2)]
3206 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3207 {'year': m.group(1), 'month': month, 'day': day,\
3208 'hour': hour, 'minute': m.group(5),\
3209 'zonediff': m.group(6)}
3210 if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
3211 return _parse_date_w3dtf(w3dtfdate)
3212 registerDateHandler(_parse_date_hungarian)
3214 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
3215 # Drake and licensed under the Python license. Removed all range checking
3216 # for month, day, hour, minute, and second, since mktime will normalize
# Parse a W3C Date-Time Format (subset of ISO 8601) string into a 9-tuple in
# GMT, handling ordinal (julian) dates, optional time, and numeric/'Z'
# timezone designators.
# NOTE(review): heavy numbering gaps throughout (e.g. 3234 -> 3246,
# 3247 -> 3253, 3279 -> 3284, 3290 -> 3295) — defaulting branches, the
# julian-day refinement loop body, and parts of the regex assembly are
# missing from this excerpt.
3218 def _parse_date_w3dtf(dateString):
3219 def __extract_date(m):
3220 year = int(m.group('year'))
# Two-digit years are mapped into the current century.
3222 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3225 julian = m.group('julian')
3227 julian = int(julian)
# First approximation of month/day from the ordinal day number,
# then refined against time.gmtime until the year-day matches.
3228 month = julian / 30 + 1
3229 day = julian % 30 + 1
3231 while jday != julian:
3232 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
3233 jday = time.gmtime(t)[-2]
3234 diff = abs(jday - julian)
3246 return year, month, day
3247 month = m.group('month')
3253 day = m.group('day')
3258 return year, month, day
3260 def __extract_time(m):
3263 hours = m.group('hours')
3267 minutes = int(m.group('minutes'))
3268 seconds = m.group('seconds')
3270 seconds = int(seconds)
3273 return hours, minutes, seconds
3275 def __extract_tzd(m):
3276 '''Return the Time Zone Designator as an offset in seconds from UTC.'''
3279 tzd = m.group('tzd')
3284 hours = int(m.group('tzdhours'))
3285 minutes = m.group('tzdminutes')
3287 minutes = int(minutes)
3290 offset = (hours*60 + minutes) * 60
# Regexes: date part (year[-month[-day]] or year-julian), timezone
# designator, time part; combined as 'date[Ttime]'.
3295 __date_re = ('(?P<year>\d\d\d\d)'
3297 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
3298 '|(?P<julian>\d\d\d)))?')
3299 __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
3300 __tzd_rx = re.compile(__tzd_re)
3301 __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
3302 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
3304 __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
3305 __datetime_rx = re.compile(__datetime_re)
3306 m = __datetime_rx.match(dateString)
# Require the whole string to match, not just a prefix.
3307 if (m is None) or (m.group() != dateString): return
3308 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
3309 if gmt[0] == 0: return
# Apply the timezone offset and convert from local mktime back to GMT.
3310 return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
3311 registerDateHandler(_parse_date_w3dtf)
# Parse RFC822-family dates via the stdlib rfc822 module, after massaging
# odd inputs (asctime ordering, 'Etc/GMT' zones, date-only strings).
# NOTE(review): numbering gaps (3316 -> 3322, 3329 -> 3331, 3332 -> 3334) —
# the asctime-reordering branch body, the date-only length check, and the
# 'if tm:' guard before mktime_tz appear to be missing from this excerpt.
3313 def _parse_date_rfc822(dateString):
3314 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
3315 data = dateString.split()
3316 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
3322 data[3:] = [s[:i], s[i+1:]]
3325 dateString = " ".join(data)
3326 # Account for the Etc/GMT timezone by stripping 'Etc/'
3327 elif len(data) == 5 and data[4].lower().startswith('etc/'):
3328 data[4] = data[4][4:]
3329 dateString = " ".join(data)
# Date-only input: assume midnight GMT.
3331 dateString += ' 00:00:00 GMT'
3332 tm = rfc822.parsedate_tz(dateString)
3334 return time.gmtime(rfc822.mktime_tz(tm))
3335 # rfc822.py defines several time zones, but we define some extra ones.
3336 # 'ET' is equivalent to 'EST', etc.
3337 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
3338 rfc822._timezones.update(_additional_timezones)
3339 registerDateHandler(_parse_date_rfc822)
# Rearrange a Perforce-style 'Dow, yyyy/mm/dd hh:mm:ss TZ' date into RFC822
# order ('Dow, dd Mon yyyy ...') and parse it with the rfc822 module.
# NOTE(review): numbering gaps (3345 -> 3347, 3351 -> 3353) — a no-match
# guard on the search and an 'if tm:' guard before mktime_tz appear to be
# missing from this excerpt.
3341 def _parse_date_perforce(aDateString):
3342 """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3343 # Fri, 2006/09/15 08:19:53 EDT
3344 _my_date_pattern = re.compile( \
3345 r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3347 dow, year, month, day, hour, minute, second, tz = \
3348 _my_date_pattern.search(aDateString).groups()
3349 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3350 dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3351 tm = rfc822.parsedate_tz(dateString)
3353 return time.gmtime(rfc822.mktime_tz(tm))
3354 registerDateHandler(_parse_date_perforce)
# Try each registered date handler in order (most recently registered first,
# per registerDateHandler) until one returns a valid 9-tuple.
# NOTE(review): numbering gaps (3358 -> 3360, 3363 -> 3365, 3365 -> 3367,
# and nothing after 3368) — the try wrapper, the 'continue' after a bad
# tuple, the 'return date9tuple' success path, and the final 'return None'
# are missing from this excerpt.
3356 def _parse_date(dateString):
3357 '''Parses a variety of date formats into a 9-tuple in GMT'''
3358 for handler in _date_handlers:
3360 date9tuple = handler(dateString)
3361 if not date9tuple: continue
3362 if len(date9tuple) != 9:
3363 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
# Sanity check: every element must be int-coercible.
3365 map(int, date9tuple)
3367 except Exception, e:
3368 if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
# Determine the true character encoding of an XML document per RFC 3023,
# combining the HTTP Content-Type charset, the <?xml ...?> declaration, and
# byte-order-mark sniffing (section F of the XML spec).
# NOTE(review): numbering gaps throughout (e.g. 3431 -> 3434, 3440 -> 3442,
# 3478 -> 3482) — the xml_encoding default, several 'if _debug' lines, the
# try/except around the declaration regex, and sniff-branch bodies are
# missing from this excerpt.
3372 def _getCharacterEncoding(http_headers, xml_data):
3373 '''Get the character encoding of the XML document
3375 http_headers is a dictionary
3376 xml_data is a raw string (not Unicode)
3378 This is so much trickier than it sounds, it's not even funny.
3379 According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3380 is application/xml, application/*+xml,
3381 application/xml-external-parsed-entity, or application/xml-dtd,
3382 the encoding given in the charset parameter of the HTTP Content-Type
3383 takes precedence over the encoding given in the XML prefix within the
3384 document, and defaults to 'utf-8' if neither are specified. But, if
3385 the HTTP Content-Type is text/xml, text/*+xml, or
3386 text/xml-external-parsed-entity, the encoding given in the XML prefix
3387 within the document is ALWAYS IGNORED and only the encoding given in
3388 the charset parameter of the HTTP Content-Type header should be
3389 respected, and it defaults to 'us-ascii' if not specified.
3391 Furthermore, discussion on the atom-syntax mailing list with the
3392 author of RFC 3023 leads me to the conclusion that any document
3393 served with a Content-Type of text/* and no charset parameter
3394 must be treated as us-ascii. (We now do this.) And also that it
3395 must always be flagged as non-well-formed. (We now do this too.)
3397 If Content-Type is unspecified (input was local file or non-HTTP source)
3398 or unrecognized (server just got it totally wrong), then go by the
3399 encoding given in the XML prefix of the document and default to
3400 'iso-8859-1' as per the HTTP specification (RFC 2616).
3402 Then, assuming we didn't find a character encoding in the HTTP headers
3403 (and the HTTP Content-type allowed us to look in the body), we need
3404 to sniff the first few bytes of the XML data and try to determine
3405 whether the encoding is ASCII-compatible. Section F of the XML
3406 specification shows the way here:
3407 http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3409 If the sniffed encoding is not ASCII-compatible, we need to make it
3410 ASCII compatible so that we can sniff further into the XML declaration
3411 to find the encoding attribute, which will tell us the true encoding.
3413 Of course, none of this guarantees that we will be able to parse the
3414 feed in the declared character encoding (assuming it was declared
3415 correctly, which many are not). CJKCodecs and iconv_codec help a lot;
3416 you should definitely install them if you can.
3417 http://cjkpython.i18n.org/
# Split 'type/subtype; charset=X' into its two interesting pieces.
3420 def _parseHTTPContentType(content_type):
3421 '''takes HTTP Content-Type header and returns (content type, charset)
3423 If no charset is specified, returns (content type, '')
3424 If no content type is specified, returns ('', '')
3425 Both return parameters are guaranteed to be lowercase strings
3427 content_type = content_type or ''
3428 content_type, params = cgi.parse_header(content_type)
3429 return content_type, params.get('charset', '').replace("'", '')
3431 sniffed_xml_encoding = ''
3434 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
3435 # Must sniff for non-ASCII-compatible character encodings before
3436 # searching for XML declaration. This heuristic is defined in
3437 # section F of the XML specification:
3438 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# Each branch below recognizes a BOM or encoding-specific byte pattern and
# transcodes xml_data to UTF-8 so the declaration regex can run on it.
3440 if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
3442 xml_data = _ebcdic_to_ascii(xml_data)
3443 elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
3445 sniffed_xml_encoding = 'utf-16be'
3446 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
3447 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
3449 sniffed_xml_encoding = 'utf-16be'
3450 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
3451 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
3453 sniffed_xml_encoding = 'utf-16le'
3454 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
3455 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
3457 sniffed_xml_encoding = 'utf-16le'
3458 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
3459 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
3461 sniffed_xml_encoding = 'utf-32be'
3462 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
3463 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
3465 sniffed_xml_encoding = 'utf-32le'
3466 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
3467 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
3469 sniffed_xml_encoding = 'utf-32be'
3470 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
3471 elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
3473 sniffed_xml_encoding = 'utf-32le'
3474 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
3475 elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
3477 sniffed_xml_encoding = 'utf-8'
3478 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
# Look for an encoding attribute in the XML declaration.
3482 xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
3484 xml_encoding_match = None
3485 if xml_encoding_match:
3486 xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
# A declared multi-byte family (ucs-2/4, utf-16/32) is superseded by the
# byte-level sniff, which knows the actual endianness.
3487 if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
3488 xml_encoding = sniffed_xml_encoding
# Apply the RFC 3023 precedence rules described in the docstring.
3489 acceptable_content_type = 0
3490 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
3491 text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
3492 if (http_content_type in application_content_types) or \
3493 (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
3494 acceptable_content_type = 1
3495 true_encoding = http_encoding or xml_encoding or 'utf-8'
3496 elif (http_content_type in text_content_types) or \
3497 (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
3498 acceptable_content_type = 1
3499 true_encoding = http_encoding or 'us-ascii'
3500 elif http_content_type.startswith('text/'):
3501 true_encoding = http_encoding or 'us-ascii'
3502 elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
3503 true_encoding = xml_encoding or 'iso-8859-1'
3505 true_encoding = xml_encoding or 'utf-8'
3506 # some feeds claim to be gb2312 but are actually gb18030.
3507 # apparently MSIE and Firefox both do the following switch:
3508 if true_encoding.lower() == 'gb2312':
3509 true_encoding = 'gb18030'
3510 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
# Re-encode a raw byte stream to UTF-8: strip any byte order mark (switching
# to the BOM-implied encoding if it contradicts the declared one), decode,
# and rewrite/insert the <?xml ...?> declaration to say utf-8.
# NOTE(review): numbering gaps (e.g. 3520 -> 3522, 3525 -> 3527) — the
# 'if _debug:' wrappers around the stderr writes and the 'data = data[N:]'
# BOM-stripping assignments are missing from this excerpt.
3512 def _toUTF8(data, encoding):
3513 '''Changes an XML data stream on the fly to specify a new encoding
3515 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3516 encoding is a string recognized by encodings.aliases
3518 if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
3519 # strip Byte Order Mark (if present)
3520 if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3522 sys.stderr.write('stripping BOM\n')
3523 if encoding != 'utf-16be':
3524 sys.stderr.write('trying utf-16be instead\n')
3525 encoding = 'utf-16be'
3527 elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3529 sys.stderr.write('stripping BOM\n')
3530 if encoding != 'utf-16le':
3531 sys.stderr.write('trying utf-16le instead\n')
3532 encoding = 'utf-16le'
3534 elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
3536 sys.stderr.write('stripping BOM\n')
3537 if encoding != 'utf-8':
3538 sys.stderr.write('trying utf-8 instead\n')
3541 elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
3543 sys.stderr.write('stripping BOM\n')
3544 if encoding != 'utf-32be':
3545 sys.stderr.write('trying utf-32be instead\n')
3546 encoding = 'utf-32be'
3548 elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
3550 sys.stderr.write('stripping BOM\n')
3551 if encoding != 'utf-32le':
3552 sys.stderr.write('trying utf-32le instead\n')
3553 encoding = 'utf-32le'
# Decode (may raise if the encoding is wrong; the caller catches that).
3555 newdata = unicode(data, encoding)
3556 if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
# Replace an existing XML declaration, or prepend one, claiming utf-8.
3557 declmatch = re.compile('^<\?xml[^>]*?>')
3558 newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
3559 if declmatch.search(newdata):
3560 newdata = declmatch.sub(newdecl, newdata)
3562 newdata = newdecl + u'\n' + newdata
3563 return newdata.encode('utf-8')
# Remove the DOCTYPE (and inline ENTITY definitions) from the head of an XML
# document, detecting the Netscape RSS 0.91 doctype, and re-insert only
# 'safe' entity definitions. Returns (version, data, safe-entity dict).
# NOTE(review): numbering gaps (3573 -> 3575, 3581 -> 3586, 3590 -> 3592) —
# the Netscape branch body (which presumably sets 'version') and part of the
# safe-entity rebuild are missing from this excerpt; 'version' is returned
# but never visibly assigned here.
3565 def _stripDoctype(data):
3566 '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3568 rss_version may be 'rss091n' or None
3569 stripped_data is the same XML document, minus the DOCTYPE
# Split the document at the first element so only the prolog is scanned.
3571 start = re.search(_s2bytes('<\w'), data)
3572 start = start and start.start() or -1
3573 head,data = data[:start+1], data[start+1:]
3575 entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3576 entity_results=entity_pattern.findall(head)
3577 head = entity_pattern.sub(_s2bytes(''), head)
3578 doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3579 doctype_results = doctype_pattern.findall(head)
3580 doctype = doctype_results and doctype_results[0] or _s2bytes('')
3581 if doctype.lower().count(_s2bytes('netscape')):
3586 # only allow in 'safe' inline entity definitions
3587 replacement=_s2bytes('')
3588 if len(doctype_results)==1 and entity_results:
# 'Safe' = simple name/value pairs with no external references.
3589 safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3590 safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
3592 replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
3593 data = doctype_pattern.sub(replacement, head) + data
3595 return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
3597 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
3598 '''Parse a feed from a URL, file, stream, or string.
3600 request_headers, if given, is a dict from http header name to value to add
3601 to the request; this overrides internally generated values.
3603 result = FeedParserDict()
3604 result['feed'] = FeedParserDict()
3605 result['entries'] = []
3608 if not isinstance(handlers, list):
3609 handlers = [handlers]
3611 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3613 except Exception, e:
3615 result['bozo_exception'] = e
3619 if hasattr(f, 'headers'):
3620 result['headers'] = dict(f.headers)
3621 # overwrite existing headers using response_headers
3622 if 'headers' in result:
3623 result['headers'].update(response_headers)
3624 elif response_headers:
3625 result['headers'] = copy.deepcopy(response_headers)
3627 # if feed is gzip-compressed, decompress it
3628 if f and data and 'headers' in result:
3629 if gzip and result['headers'].get('content-encoding') == 'gzip':
3631 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3632 except Exception, e:
3633 # Some feeds claim to be gzipped but they're not, so
3634 # we get garbage. Ideally, we should re-request the
3635 # feed without the 'Accept-encoding: gzip' header,
3638 result['bozo_exception'] = e
3640 elif zlib and result['headers'].get('content-encoding') == 'deflate':
3642 data = zlib.decompress(data, -zlib.MAX_WBITS)
3643 except Exception, e:
3645 result['bozo_exception'] = e
3649 if 'headers' in result:
3650 if 'etag' in result['headers'] or 'ETag' in result['headers']:
3651 etag = result['headers'].get('etag', result['headers'].get('ETag'))
3653 result['etag'] = etag
3654 if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
3655 modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
3657 result['modified'] = _parse_date(modified)
3658 if hasattr(f, 'url'):
3659 result['href'] = f.url
3660 result['status'] = 200
3661 if hasattr(f, 'status'):
3662 result['status'] = f.status
3663 if hasattr(f, 'close'):
3666 # there are four encodings to keep track of:
3667 # - http_encoding is the encoding declared in the Content-Type HTTP header
3668 # - xml_encoding is the encoding declared in the <?xml declaration
3669 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
3670 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3671 http_headers = result.get('headers', {})
3672 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
3673 _getCharacterEncoding(http_headers, data)
3674 if http_headers and (not acceptable_content_type):
3675 if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
3676 bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
3678 bozo_message = 'no Content-type specified'
3680 result['bozo_exception'] = NonXMLContentType(bozo_message)
3682 if data is not None:
3683 result['version'], data, entities = _stripDoctype(data)
3685 # ensure that baseuri is an absolute uri using an acceptable URI scheme
3686 contentloc = http_headers.get('content-location', http_headers.get('Content-Location', ''))
3687 href = result.get('href', '')
3688 baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3690 baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
3692 # if server sent 304, we're done
3693 if result.get('status', 0) == 304:
3694 result['version'] = ''
3695 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3696 'so the server sent no data. This is a feature, not a bug!'
3699 # if there was a problem downloading, we're done
3703 # determine character encoding
3704 use_strict_parser = 0
3706 tried_encodings = []
3707 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3708 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
3709 if not proposed_encoding: continue
3710 if proposed_encoding in tried_encodings: continue
3711 tried_encodings.append(proposed_encoding)
3713 data = _toUTF8(data, proposed_encoding)
3714 known_encoding = use_strict_parser = 1
3718 # if no luck and we have auto-detection library, try that
3719 if (not known_encoding) and chardet:
3721 proposed_encoding = chardet.detect(data)['encoding']
3722 if proposed_encoding and (proposed_encoding not in tried_encodings):
3723 tried_encodings.append(proposed_encoding)
3724 data = _toUTF8(data, proposed_encoding)
3725 known_encoding = use_strict_parser = 1
3728 # if still no luck and we haven't tried utf-8 yet, try that
3729 if (not known_encoding) and ('utf-8' not in tried_encodings):
3731 proposed_encoding = 'utf-8'
3732 tried_encodings.append(proposed_encoding)
3733 data = _toUTF8(data, proposed_encoding)
3734 known_encoding = use_strict_parser = 1
3737 # if still no luck and we haven't tried windows-1252 yet, try that
3738 if (not known_encoding) and ('windows-1252' not in tried_encodings):
3740 proposed_encoding = 'windows-1252'
3741 tried_encodings.append(proposed_encoding)
3742 data = _toUTF8(data, proposed_encoding)
3743 known_encoding = use_strict_parser = 1
3746 # if still no luck and we haven't tried iso-8859-2 yet, try that.
3747 if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
3749 proposed_encoding = 'iso-8859-2'
3750 tried_encodings.append(proposed_encoding)
3751 data = _toUTF8(data, proposed_encoding)
3752 known_encoding = use_strict_parser = 1
3755 # if still no luck, give up
3756 if not known_encoding:
3758 result['bozo_exception'] = CharacterEncodingUnknown( \
3759 'document encoding unknown, I tried ' + \
3760 '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
3761 (result['encoding'], xml_encoding))
3762 result['encoding'] = ''
3763 elif proposed_encoding != result['encoding']:
3765 result['bozo_exception'] = CharacterEncodingOverride( \
3766 'document declared as %s, but parsed as %s' % \
3767 (result['encoding'], proposed_encoding))
3768 result['encoding'] = proposed_encoding
3770 if not _XML_AVAILABLE:
3771 use_strict_parser = 0
3772 if use_strict_parser:
3773 # initialize the SAX parser
3774 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3775 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3776 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3777 saxparser.setContentHandler(feedparser)
3778 saxparser.setErrorHandler(feedparser)
3779 source = xml.sax.xmlreader.InputSource()
3780 source.setByteStream(_StringIO(data))
3781 if hasattr(saxparser, '_ns_stack'):
3782 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
3783 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
3784 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
3786 saxparser.parse(source)
3787 except Exception, e:
3790 traceback.print_stack()
3791 traceback.print_exc()
3792 sys.stderr.write('xml parsing failed\n')
3794 result['bozo_exception'] = feedparser.exc or e
3795 use_strict_parser = 0
3796 if not use_strict_parser:
3797 feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3798 feedparser.feed(data.decode('utf-8', 'replace'))
3799 result['feed'] = feedparser.feeddata
3800 result['entries'] = feedparser.entries
3801 result['version'] = result['version'] or feedparser.version
3802 result['namespaces'] = feedparser.namespacesInUse
# Base-class constructor (the enclosing 'class Serializer:' line lies
# outside this chunk): stash the feedparser result dict so subclasses'
# write() methods can serialize it later.
3806 def __init__(self, results):
# self.results is the dict returned by parse(); read-only here.
3807 self.results = results
# Serializer that emits the parse results as flat "dotted.key=value"
# text lines -- the "-f text" output format of the command-line tool.
# NOTE(review): several physical lines of this class appear to be
# missing from this chunk (e.g. the loops that bind 'k', 'n' and
# 'index', and the leaf-write tail); comments below describe only the
# visible code.
3809 class TextSerializer(Serializer):
# Entry point: recursively walk self.results with an empty path prefix.
3810 def write(self, stream=sys.stdout):
3811 self._writer(stream, self.results, '')
# Recursive worker: 'prefix' accumulates the dotted path down to 'node'.
3813 def _writer(self, stream, node, prefix):
# Mapping-like nodes: recurse into each key, extending the dotted path.
3815 if hasattr(node, 'keys'):
# Skip 'description'/'link', and skip any key that has a '_detail' or
# '_parsed' sibling entry in the same mapping.
3819 if k in ('description', 'link'): continue
3820 if node.has_key(k + '_detail'): continue
3821 if node.has_key(k + '_parsed'): continue
3822 self._writer(stream, node[k], prefix + k + '.')
# List nodes: recurse per element, rewriting the trailing '.' of the
# prefix into an '[index]' path component.
3823 elif type(node) == types.ListType:
3826 self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
# Leaf nodes: UTF-8 encode, escape backslashes, drop CRs, turn newlines
# into literal '\n', then write the dotted path minus its trailing '.'.
3830 s = str(node).encode('utf-8')
3831 s = s.replace('\\', '\\\\')
3832 s = s.replace('\r', '')
3833 s = s.replace('\n', r'\n')
3834 stream.write(prefix[:-1])
class PprintSerializer(Serializer):
    """Serialize parse results with pprint (the default '-f pprint' format)."""

    def write(self, stream=sys.stdout):
        """Pretty-print self.results to *stream*.

        If the results carry an 'href' key (the URL the feed was
        actually fetched from), write it first on its own line so the
        output identifies its source.
        """
        # 'in' replaces dict.has_key(), which is deprecated in Python 2
        # and removed in Python 3; the rest of this file already uses
        # the 'in' operator for dict membership tests.
        if 'href' in self.results:
            stream.write(self.results['href'] + '\n\n')
        # Local import mirrors the original code's lazy-import style.
        from pprint import pprint
        pprint(self.results, stream)
# Command-line interface: parse each URL/filename/'-' argument and dump
# the result in the format chosen with -f/--format.
# NOTE(review): this chunk is missing physical lines (e.g. the branch
# around print_help, the fallback '_Options' class definition, and the
# 'for url in urls:' loop that binds 'url'); comments describe only the
# visible code.
3849 if __name__ == '__main__':
3851 from optparse import OptionParser
# Build the option parser; default output format is 'pprint'.
3856 optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
3857 optionParser.set_defaults(format="pprint")
3858 optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
3859 optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
3860 optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
3861 optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
3862 optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
3863 optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
3864 (options, urls) = optionParser.parse_args()
# Presumably reached when no URLs were supplied -- the guarding branch
# is not visible in this chunk.
3868 optionParser.print_help()
3871 if not sys.argv[1:]:
# Fallback path: no real command line, so synthesize empty options.
3875 etag = modified = agent = referrer = None
# '_Options' is defined on lines elided from this chunk.
3877 options = _Options()
3880 zopeCompatibilityHack()
# Map the --format value to a serializer class by naming convention
# ('text' -> TextSerializer, 'pprint' -> PprintSerializer); fall back
# to the base Serializer when no matching class exists in globals().
3882 serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
# Inside the (elided) 'for url in urls:' loop: fetch, parse, serialize.
3884 results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
3885 serializer(results).write(sys.stdout)