From: W. Trevor King Date: Sat, 28 Sep 2013 16:51:03 +0000 (-0700) Subject: feed: Add 'default' argument to Feed._html2text for HTMLParseError X-Git-Tag: v3.7~3^2~1 X-Git-Url: http://git.tremily.us/?p=rss2email.git;a=commitdiff_plain;h=f1284d7eefafbdecd85c721eaf4fa16c03ffd30e feed: Add 'default' argument to Feed._html2text for HTMLParseError This allows us to easily fall back on an unconverted string in the event that the input HTML is malformed. We already caught HTMLParseError when converting HTML to plain text for non-html mail, but we didn't catch it in Feed._get_entry_title. Now we gracefully handle the situation by treating the malformed HTML as plain text. --- diff --git a/rss2email/feed.py b/rss2email/feed.py index 3999b0c..16191e2 100644 --- a/rss2email/feed.py +++ b/rss2email/feed.py @@ -416,9 +416,14 @@ class Feed (object): not version): raise _error.ProcessingError(parsed=parsed, feed=feed) - def _html2text(self, html, baseurl=''): + def _html2text(self, html, baseurl='', default=None): self.config.setup_html2text(section=self.section) - return _html2text.html2text(html=html, baseurl=baseurl) + try: + return _html2text.html2text(html=html, baseurl=baseurl) + except _html_parser.HTMLParseError as e: + if default is not None: + return default + raise def _process_entry(self, parsed, entry): id_ = self._get_entry_id(entry) @@ -501,12 +506,12 @@ class Feed (object): if hasattr(entry, 'title_detail') and entry.title_detail: title = entry.title_detail.value if 'html' in entry.title_detail.type: - title = self._html2text(title) + title = self._html2text(title, default=title) else: content = self._get_entry_content(entry) value = content['value'] if content['type'] in ('text/html', 'application/xhtml+xml'): - value = self._html2text(value) + value = self._html2text(value, default=value) title = value[:70] title = title.replace('\n', ' ').strip() return title