This allows us to easily fall back on an unconverted string in the
event that the input HTML is malformed. We already caught
HTMLParseError when converting HTML to plain text for non-HTML mail,
but we didn't catch it in Feed._get_entry_title. Now we gracefully
handle the situation by treating the malformed HTML as plain text.
not version):
raise _error.ProcessingError(parsed=parsed, feed=feed)
not version):
raise _error.ProcessingError(parsed=parsed, feed=feed)
- def _html2text(self, html, baseurl=''):
+ def _html2text(self, html, baseurl='', default=None):
self.config.setup_html2text(section=self.section)
self.config.setup_html2text(section=self.section)
- return _html2text.html2text(html=html, baseurl=baseurl)
+ try:
+ return _html2text.html2text(html=html, baseurl=baseurl)
+ except _html_parser.HTMLParseError as e:
+ if default is not None:
+ return default
+ raise
def _process_entry(self, parsed, entry):
id_ = self._get_entry_id(entry)
def _process_entry(self, parsed, entry):
id_ = self._get_entry_id(entry)
if hasattr(entry, 'title_detail') and entry.title_detail:
title = entry.title_detail.value
if 'html' in entry.title_detail.type:
if hasattr(entry, 'title_detail') and entry.title_detail:
title = entry.title_detail.value
if 'html' in entry.title_detail.type:
- title = self._html2text(title)
+ title = self._html2text(title, default=title)
else:
content = self._get_entry_content(entry)
value = content['value']
if content['type'] in ('text/html', 'application/xhtml+xml'):
else:
content = self._get_entry_content(entry)
value = content['value']
if content['type'] in ('text/html', 'application/xhtml+xml'):
- value = self._html2text(value)
+ value = self._html2text(value, default=value)
title = value[:70]
title = title.replace('\n', ' ').strip()
return title
title = value[:70]
title = title.replace('\n', ' ').strip()
return title