From: W. Trevor King <wking@tremily.us> Date: Tue, 10 Sep 2013 18:41:12 +0000 (-0700) Subject: feed: Disable feedparser's PREFERRED_XML_PARSERS X-Git-Tag: v3.7~8^2 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=2d045d41fd3c3124884c10e83d76141deea616c6;p=rss2email.git feed: Disable feedparser's PREFERRED_XML_PARSERS Feedparser's default parser (drv_libxml2) has trouble parsing byte streams in Python 3: $ python -c 'import rss2email.feed; import doctest; doctest.testmod(rss2email.feed)' ... File "rss2email/feed.py", line 319, in rss2email.feed.Feed._fetch Failed example: parsed = feed._fetch() Exception raised: Traceback (most recent call last): File "rss2email/util.py", line 61, in run self.result = self._target(*self._args, **self._kwargs) File "/.../feedparser/feedparser.py", line 3745, in parse saxparser.parse(source) File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 270, in parse _d(reader.Name()),_d(reader.Value())) File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 70, in _d return _decoder(s)[0] File "/usr/lib64/python3.2/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) TypeError: 'str' does not support the buffer interface The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/usr/lib64/python3.2/doctest.py", line 1288, in __run compileflags, 1), test.globs) File "<doctest rss2email.feed.Feed._fetch[1]>", line 1, in <module> parsed = feed._fetch() File "rss2email/feed.py", line 336, in _fetch return f(self.url, self.etag, modified=self.modified, **kwargs) File "rss2email/util.py", line 76, in __call__ time_limited_function=self) from self.error[1] rss2email.error.TimeoutError: error while running time limited function: 'str' does not support the buffer interface ... You can reproduce the underlying exception with this minimal script: import io import xml.sax import xml.sax.handler data = b'<feed xmlns="http://www.w3.org/2005/Atom"><entry><author><name>Example author</name><email>me@example.com</email><url>http://example.com/</url></author></entry></feed>' source = xml.sax.xmlreader.InputSource() source.setByteStream(io.BytesIO(data)) saxparser = xml.sax.make_parser(["drv_libxml2"]) saxparser.setContentHandler(xml.sax.handler.ContentHandler()) saxparser.parse(source) which raises: Traceback (most recent call last): File "<stdin>", line 13, in <module> saxparser.parse(source) File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 222, in parse eltName = _d(reader.Name()) File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 70, in _d return _decoder(s)[0] File "/usr/lib64/python3.2/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) TypeError: 'str' does not support the buffer interface at least for libxml2-2.9.1. By using the stdlib's default parser (instead of drv_libxml2), we can avoid the error and get successful parsing. If you don't have drv_libxml2 installed, sax was already falling back on the stdlib's default parser, so this commit will be a no-op. Signed-off-by: W. Trevor King <wking@tremily.us> --- diff --git a/rss2email/feed.py b/rss2email/feed.py index 3999b0c..41ede76 100644 --- a/rss2email/feed.py +++ b/rss2email/feed.py @@ -64,6 +64,10 @@ for e in ['error', 'herror', 'gaierror']: del e # cleanup namespace _SOCKET_ERRORS = tuple(_SOCKET_ERRORS) +# drv_libxml2 raises: +# TypeError: 'str' does not support the buffer interface +_feedparser.PREFERRED_XML_PARSERS = [] + class Feed (object): """Utility class for feed manipulation and storage.