feed: don't emit error if parser able to auto-determine encoding
[rss2email.git] / rss2email / feed.py
index 0e67d319a4be8a9b2c444c4b44c411cc89dd7b25..3d9654f53243add7c53a14632a62e38af70e97d9 100644 (file)
@@ -28,6 +28,8 @@
 """
 
 import collections as _collections
+from email.mime.message import MIMEMessage as _MIMEMessage
+from email.mime.multipart import MIMEMultipart as _MIMEMultipart
 from email.utils import formataddr as _formataddr
 import hashlib as _hashlib
 import html.parser as _html_parser
@@ -52,7 +54,8 @@ from . import error as _error
 from . import util as _util
 
 
-_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
+_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
+_feedparser.USER_AGENT = _USER_AGENT
 _urllib_request.install_opener(_urllib_request.build_opener())
 _SOCKET_ERRORS = []
 for e in ['error', 'herror', 'gaierror']:
@@ -158,6 +161,7 @@ class Feed (object):
 
     # hints for value conversion
     _boolean_attributes = [
+        'digest',
         'force_from',
         'use_publisher_email',
         'friendly_name',
@@ -184,6 +188,7 @@ class Feed (object):
 
     _function_attributes = [
         'post_process',
+        'digest_post_process',
         ]
 
     def __init__(self, name=None, url=None, to=None, config=None):
@@ -399,6 +404,11 @@ class Feed (object):
         elif isinstance(exc, _sax.SAXParseException):
             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
             warned = True
+        elif (parsed.bozo and
+              isinstance(exc, _feedparser.CharacterEncodingOverride)):
+            _LOG.warning(
+                'incorrectly declared encoding: {}: {}'.format(exc, self))
+            warned = True
         elif parsed.bozo or exc:
             if exc is None:
                 exc = "can't process"
@@ -411,6 +421,10 @@ class Feed (object):
             not version):
             raise _error.ProcessingError(parsed=parsed, feed=feed)
 
+    def _html2text(self, html, baseurl=''):  # HTML -> text wrapper
+        self.config.setup_html2text(section=self.section)  # runs per call
+        return _html2text.html2text(html=html, baseurl=baseurl)
+
     def _process_entry(self, parsed, entry):
         id_ = self._get_entry_id(entry)
         # If .trust_guid isn't set, we get back hashes of the content.
@@ -428,7 +442,7 @@ class Feed (object):
         extra_headers = _collections.OrderedDict((
                 ('Date', self._get_entry_date(entry)),
                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
-                ('User-Agent', 'rss2email'),
+                ('User-Agent', _USER_AGENT),
                 ('X-RSS-Feed', self.url),
                 ('X-RSS-ID', id_),
                 ('X-RSS-URL', self._get_entry_link(entry)),
@@ -492,12 +506,12 @@ class Feed (object):
         if hasattr(entry, 'title_detail') and entry.title_detail:
             title = entry.title_detail.value
             if 'html' in entry.title_detail.type:
-                title = _html2text.html2text(title)
+                title = self._html2text(title)
         else:
             content = self._get_entry_content(entry)
             value = content['value']
             if content['type'] in ('text/html', 'application/xhtml+xml'):
-                value = _html2text.html2text(value)
+                value = self._html2text(value)
             title = value[:70]
         title = title.replace('\n', ' ').strip()
         return title
@@ -700,7 +714,7 @@ class Feed (object):
             lines.extend([
                     '</head>',
                     '<body>',
-                    '<div id="entry>',
+                    '<div id="entry">',
                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
                         link, subject),
                     '<div id="body">',
@@ -743,7 +757,7 @@ class Feed (object):
         else:  # not self.html_mail
             if content['type'] in ('text/html', 'application/xhtml+xml'):
                 try:
-                    lines = [_html2text.html2text(content['value'])]
+                    lines = [self._html2text(content['value'])]
                 except _html_parser.HTMLParseError as e:
                     raise _error.ProcessingError(parsed=None, feed=self)
             else:
@@ -788,12 +802,65 @@ class Feed (object):
         if not self.to:
             raise _error.NoToEmailAddress(feed=self)
         parsed = self._fetch()
+
+        if self.digest:
+            digest = self._new_digest()
+            seen = []
+
         for (guid, id_, sender, message) in self._process(parsed):
             _LOG.debug('new message: {}'.format(message['Subject']))
-            if send:
-                self._send(sender=sender, message=message)
+            if self.digest:
+                seen.append((guid, id_))
+                self._append_to_digest(digest=digest, message=message)
+            else:
+                if send:
+                    self._send(sender=sender, message=message)
+                if guid not in self.seen:
+                    self.seen[guid] = {}
+                self.seen[guid]['id'] = id_
+
+        if self.digest and seen:
+            if self.digest_post_process:
+                digest = self.digest_post_process(
+                    feed=self, parsed=parsed, seen=seen, message=digest)
+                if not digest:
+                    return
+            self._send_digest(
+                digest=digest, seen=seen, sender=sender, send=send)
+
+        self.etag = parsed.get('etag', None)
+        self.modified = parsed.get('modified', None)
+
+    def _new_digest(self):  # build an empty multipart/digest container
+        digest = _MIMEMultipart('digest')
+        digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
+        digest['Subject'] = 'digest for {}'.format(self.name)
+        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
+        digest['User-Agent'] = _USER_AGENT
+        digest['X-RSS-Feed'] = self.url
+        return digest  # From and Date are filled in by _send_digest
+
+    def _append_to_digest(self, digest, message):  # add one entry
+        part = _MIMEMessage(message)  # wrap the entry as its own MIME part
+        part.add_header('Content-Disposition', 'attachment')
+        digest.attach(part)
+
+    def _send_digest(self, digest, seen, sender, send=True):
+        """Send a digest message and mark its entries as seen
+
+        The date is extracted from the last message in the digest
+        payload.  We assume that this part exists.  If you don't have
+        any messages in the digest, don't call this function.
+        """
+        digest['From'] = sender  # TODO: _Header(), _formataddr()...
+        last_part = digest.get_payload()[-1]  # last attached part
+        last_message = last_part.get_payload()[0]  # message inside it
+        digest['Date'] = last_message['Date']
+
+        _LOG.debug('new digest for {}'.format(self))
+        if send:
+            self._send(sender=sender, message=digest)
+        for (guid, id_) in seen:  # recorded even when send is False
             if guid not in self.seen:
                 self.seen[guid] = {}
             self.seen[guid]['id'] = id_
-        self.etag = parsed.get('etag', None)
-        self.modified = parsed.get('modified', None)