From 2d045d41fd3c3124884c10e83d76141deea616c6 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Tue, 10 Sep 2013 11:41:12 -0700
Subject: [PATCH] feed: Disable feedparser's PREFERRED_XML_PARSERS

Feedparser's default parser (drv_libxml2) has trouble parsing byte
streams in Python 3:

  $ python -c 'import rss2email.feed; import doctest; doctest.testmod(rss2email.feed)'
  ...
  File "rss2email/feed.py", line 319, in rss2email.feed.Feed._fetch
  Failed example:
      parsed = feed._fetch()
  Exception raised:
      Traceback (most recent call last):
        File "rss2email/util.py", line 61, in run
          self.result = self._target(*self._args, **self._kwargs)
        File "/.../feedparser/feedparser.py", line 3745, in parse
          saxparser.parse(source)
        File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 270, in parse
          _d(reader.Name()),_d(reader.Value()))
        File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 70, in _d
          return _decoder(s)[0]
        File "/usr/lib64/python3.2/encodings/utf_8.py", line 16, in decode
          return codecs.utf_8_decode(input, errors, True)
      TypeError: 'str' does not support the buffer interface

      The above exception was the direct cause of the following exception:

      Traceback (most recent call last):
        File "/usr/lib64/python3.2/doctest.py", line 1288, in __run
          compileflags, 1), test.globs)
        File "", line 1, in <module>
          parsed = feed._fetch()
        File "rss2email/feed.py", line 336, in _fetch
          return f(self.url, self.etag, modified=self.modified, **kwargs)
        File "rss2email/util.py", line 76, in __call__
          time_limited_function=self) from self.error[1]
      rss2email.error.TimeoutError: error while running time limited function: 'str' does not support the buffer interface
  ...
You can reproduce the underlying exception with this minimal script:

  import io
  import xml.sax
  import xml.sax.handler

  data = b'Example authorme@example.comhttp://example.com/'
  source = xml.sax.xmlreader.InputSource()
  source.setByteStream(io.BytesIO(data))
  saxparser = xml.sax.make_parser(["drv_libxml2"])
  saxparser.setContentHandler(xml.sax.handler.ContentHandler())
  saxparser.parse(source)

which raises:

  Traceback (most recent call last):
    File "", line 13, in <module>
      saxparser.parse(source)
    File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 222, in parse
      eltName = _d(reader.Name())
    File "/usr/lib64/python3.2/site-packages/drv_libxml2.py", line 70, in _d
      return _decoder(s)[0]
    File "/usr/lib64/python3.2/encodings/utf_8.py", line 16, in decode
      return codecs.utf_8_decode(input, errors, True)
  TypeError: 'str' does not support the buffer interface

at least for libxml2-2.9.1.  By using the stdlib's default parser
(instead of drv_libxml2), we can avoid the error and get successful
parsing.  If you don't have drv_libxml2 installed, sax was already
falling back on the stdlib's default parser, so this commit will be a
no-op.

Signed-off-by: W. Trevor King
---
 rss2email/feed.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rss2email/feed.py b/rss2email/feed.py
index 3999b0c..41ede76 100644
--- a/rss2email/feed.py
+++ b/rss2email/feed.py
@@ -64,6 +64,10 @@ for e in ['error', 'herror', 'gaierror']:
 del e  # cleanup namespace
 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
 
+# drv_libxml2 raises:
+#   TypeError: 'str' does not support the buffer interface
+_feedparser.PREFERRED_XML_PARSERS = []
+
 
 class Feed (object):
     """Utility class for feed manipulation and storage.
-- 
2.26.2