From: W. Trevor King Date: Wed, 20 Mar 2013 09:27:03 +0000 (-0400) Subject: feed: Catch parsing errors during html2text X-Git-Tag: v3.3~9 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=a3719f88fbd2faed3418c8391c3245465b4b850b;p=rss2email.git feed: Catch parsing errors during html2text This avoids crashing with: Traceback (most recent call last): ... File ".../rss2email/feed.py", line 732, in _process_entry_content lines = [_html2text.html2text(content['value'])] ... File "/usr/lib/python3.2/html/parser.py", line 149, in error raise HTMLParseError(message, self.getpos()) html.parser.HTMLParseError: EOF in middle of construct, at line 1, column 262 The troublesome feed was: $ wget -S http://www.cell.com/rssFeed/biophysj/rss.NewIssueAndArticles.xml --2013-03-20 05:22:08-- http://www.cell.com/rssFeed/biophysj/rss.NewIssueAndArticles.xml Resolving www.cell.com... 145.36.42.28 Connecting to www.cell.com|145.36.42.28|:80... connected. HTTP request sent, awaiting response... HTTP/1.1 200 OK Date: Wed, 20 Mar 2013 09:23:19 GMT Server: IBM_HTTP_Server Last-Modified: Tue, 19 Mar 2013 22:00:04 GMT Accept-Ranges: bytes Content-Length: 15362 Vary: Accept-Encoding Keep-Alive: timeout=10, max=100 Connection: Keep-Alive Content-Type: text/xml Length: 15362 (15K) [text/xml] Saving to: ‘rss.NewIssueAndArticles.xml’ 100%[======================================>] 15,362 94.1KB/s in 0.2s 2013-03-20 05:22:08 (94.1 KB/s) - ‘rss.NewIssueAndArticles.xml’ saved [15362/15362] which contained the poorly split summary: Synergistic Insertion of Antimicrobial Magainin-Family Peptides in Membranes Depends on the Lipid Spontaneous Curvature http://www.cell.com/biophysj/abstract/S0006-3495(13)00153-7 Erik Strandberg, Jonathan Zerweck, Parvesh Wadhwani, Anne S. Ulrich. PGLa and magainin 2 (MAG2) are amphiphilic antimicrobial peptides from frog skin with known synergistic activity. The orientation of the two helices in membranes was studied using solid-state <sup.... Tue, 19 Mar 2013 00:00:00 GMT http://www.cell.com/biophysj/abstract/S0006-3495(13)00153-7 2013-03-19T00:00:00Z The ' --- diff --git a/rss2email/feed.py b/rss2email/feed.py index a86be3e..e5f962d 100644 --- a/rss2email/feed.py +++ b/rss2email/feed.py @@ -30,6 +30,7 @@ import collections as _collections from email.utils import formataddr as _formataddr import hashlib as _hashlib +import html.parser as _html_parser import re as _re import socket as _socket import time as _time @@ -434,8 +435,12 @@ class Feed (object): self.bonus_header)) content = self._get_entry_content(entry) - content = self._process_entry_content( - entry=entry, content=content, subject=subject) + try: + content = self._process_entry_content( + entry=entry, content=content, subject=subject) + except _error.ProcessingError as e: + e.parsed = parsed + raise message = _email.get_message( sender=sender, recipient=self.to, @@ -724,7 +729,10 @@ class Feed (object): return content else: # not self.html_mail if content['type'] in ('text/html', 'application/xhtml+xml'): - lines = [_html2text.html2text(content['value'])] + try: + lines = [_html2text.html2text(content['value'])] + except _html_parser.HTMLParseError as e: + raise _error.ProcessingError(parsed=None, feed=self) else: lines = [content['value']] lines.append('')