1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
"""Define the ``Feed`` class for handling a single feed"""
import collections as _collections
from email.mime.message import MIMEMessage as _MIMEMessage
from email.mime.multipart import MIMEMultipart as _MIMEMultipart
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import html.parser as _html_parser
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
# Identify ourselves to servers; feedparser and urllib both use it.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect whichever socket exception classes this platform's socket
# module actually defines, so we can catch them as a group later.
# BUG FIX: the list must be initialized before .append() is called on it.
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
    """Utility class for feed manipulation and storage.

    >>> from .config import CONFIG
    ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
    test-feed (http://example.com/feed.atom -> a@b.com)
    'user@rss2email.invalid'
    >>> feed.from_email = 'a@b.com'
    >>> feed.save_to_config()
    >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
    from = user@rss2email.invalid
    url = http://example.com/feed.atom
    >>> feed.etag = 'dummy etag'
    >>> string = pickle.dumps(feed)
    >>> feed = pickle.loads(string)
    >>> feed.load_from_config(config=CONFIG)
    'http://example.com/feed.atom'

    Names can only contain ASCII letters, digits, and '._-'.  Here the
    invalid space causes an exception:

    >>> Feed(name='invalid name')
    Traceback (most recent call last):
      ...
    rss2email.error.InvalidFeedName: invalid feed name 'invalid name'

    You must define a URL:

    >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
    Traceback (most recent call last):
      ...
    rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}

    >>> CONFIG['DEFAULT']['to'] = ''
    >>> test_section = CONFIG.pop('feed.test-feed')
    """
    # Feed names are restricted to ASCII letters/digits plus '._-'
    # (enforced in _set_name).
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')

    # Attributes that hold runtime state, saved/loaded from feed.dat
    # using __getstate__/__setstate__.
    # NOTE(review): the element list is missing from this view of the
    # file -- confirm the full contents upstream.
    _dynamic_attributes = [

    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    # NOTE(review): element list missing from this view.
    _non_default_configured_attributes = [
    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email'  # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())

    # hints for value conversion: names listed below are parsed with
    # getboolean()/getint()/... in _get_configured_attribute_value().
    # NOTE(review): these lists appear truncated in this view.
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',
    _integer_attributes = [
    _function_attributes = [
        'digest_post_process',
    def __init__(self, name=None, url=None, to=None, config=None):
        """Validate the feed name and initialize feed state.

        NOTE(review): several lines of this method -- and the
        ``def __str__``/``def __repr__`` headers below -- are missing
        from this view; the bodies shown are partial.
        """
        self._set_name(name=name)
        # seed the dynamic attributes, then overlay configured values
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)

        # NOTE(review): `def __str__(self):` header not visible in this view.
        return '{} ({} -> {})'.format(self.name, self.url, self.to)

        # NOTE(review): `def __repr__(self):` header not visible in this view.
        return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # NOTE(review): the `return dict(` opener is missing from this
        # view; the generator expression below is its argument.
            (key,getattr(self,key)) for key in self._dynamic_attributes)

    get_state = __getstate__  # make it publicly accessible
218 def __setstate__(self, state):
219 "Restore dynamic attributes"
220 keys = sorted(state.keys())
221 if keys != sorted(self._dynamic_attributes):
222 raise ValueError(state)
223 self._set_name(name=state['name'])
224 self.__dict__.update(state)
226 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
            # only persist options that differ from the DEFAULT section
            if (attr in self._non_default_configured_attributes or
                value != default[key]):
                # NOTE(review): the `data[key] = value` assignment is
                # missing from this view of the file.
        self.config[self.section] = data
    def load_from_config(self, config=None):
        """Restore configured attributes.

        NOTE(review): several lines of this method are missing from
        this view (the `config is None` guard, the section-selection
        `else:`, the validation loop headers, and the `data = dict(`
        opener); the body shown is partial.
        """
            config = _config.CONFIG
        if self.section in self.config:
            data = self.config[self.section]
            data = self.config['DEFAULT']
        keys = sorted(data.keys())
        expected = sorted(self._configured_attribute_translations.values())
            # every expected option (except non-default ones) must exist
            if (key not in keys and
                key not in self._non_default_configured_attributes):
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='missing configuration key: {}'.format(key))
            # and nothing unexpected may appear
            if key not in expected:
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='extra configuration key: {}'.format(key))
            # translate option names back to attribute names and convert
            # the stored strings to typed values
            (self._configured_attribute_inverse_translations[k],
             self._get_configured_attribute_value(
                attribute=self._configured_attribute_inverse_translations[k],
            for k in data.keys())
        for attr in self._non_default_configured_attributes:
        self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        """Convert attribute *value* to its string form for ConfigParser.

        NOTE(review): the leading branch and the final fallback return
        are missing from this view of the file.
        """
        elif attribute in self._list_attributes:
            # lists are stored as comma-separated strings
            return ', '.join(value)
        elif attribute in self._function_attributes:
            return _util.import_name(value)
286 def _get_configured_attribute_value(self, attribute, key, data):
287 if attribute in self._boolean_attributes:
288 return data.getboolean(key)
289 elif attribute in self._integer_attributes:
290 return data.getint(key)
291 elif attribute in self._list_attributes:
292 return [x.strip() for x in data[key].split(',')]
293 elif attribute in self._function_attributes:
295 return _util.import_function(data[key])
    # NOTE(review): the `def _reset(self):` header and body are missing
    # from this view; only its docstring opener survives.
    """Reset dynamic data"""

    def _set_name(self, name):
        """Validate *name* and derive the config section name.

        Raises InvalidFeedName unless *name* matches _name_regexp.
        NOTE(review): the `self.name = name` assignment is missing from
        this view of the file.
        """
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        self.section = 'feed.{}'.format(self.name)
        # NOTE(review): the `def _fetch(self):` header is missing from
        # this view; the docstring and body below belong to it.
        """Fetch and parse a feed using feedparser.

        ... name='test-feed',
        ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> parsed = feed._fetch()
        """
        _LOG.info('fetch {}'.format(self))
        # a feed without a URL cannot be fetched
        raise _error.InvalidFeedConfig(setting='url', feed=self)
        if self.section in self.config:
            config = self.config[self.section]
            config = self.config['DEFAULT']
        proxy = config['proxy']
        timeout = config.getint('feed-timeout')
        # NOTE(review): the `kwargs = {}` / `if proxy:` lines are
        # missing from this view.
        kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
        # enforce the configured timeout around feedparser.parse
        f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
        return f(self.url, self.etag, modified=self.modified, **kwargs)
    def _process(self, parsed):
        """Yield (guid, id_, sender, message) for each new entry.

        NOTE(review): the `if processed:` guard and part of the
        post_process call are missing from this view.
        """
        _LOG.info('process {}'.format(self))
        self._check_for_errors(parsed)
        # oldest entry first, so messages are generated in published order
        for entry in reversed(parsed.entries):
            _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
            processed = self._process_entry(parsed=parsed, entry=entry)
            guid,id_,sender,message = processed
            # optional user hook: may rewrite or drop the message
            if self.post_process:
                message = self.post_process(
                    feed=self, parsed=parsed, entry=entry, guid=guid,
                yield (guid, id_, sender, message)
353 def _check_for_errors(self, parsed):
355 status = getattr(parsed, 'status', 200)
356 _LOG.debug('HTTP status {}'.format(status))
358 _LOG.info('redirect {} from {} to {}'.format(
359 self.name, self.url, parsed['url']))
360 self.url = parsed['url']
361 elif status not in [200, 302, 304]:
362 raise _error.HTTPError(status=status, feed=self)
364 http_headers = parsed.get('headers', {})
366 _LOG.debug('HTTP headers: {}'.format(http_headers))
368 _LOG.warning('could not get HTTP headers: {}'.format(self))
371 if 'html' in http_headers.get('content-type', 'rss'):
372 _LOG.warning('looks like HTML: {}'.format(self))
374 if http_headers.get('content-length', '1') == '0':
375 _LOG.warning('empty page: {}'.format(self))
378 version = parsed.get('version', None)
380 _LOG.debug('feed version {}'.format(version))
382 _LOG.warning('unrecognized version: {}'.format(self))
385 exc = parsed.get('bozo_exception', None)
386 if isinstance(exc, _socket.timeout):
387 _LOG.error('timed out: {}'.format(self))
389 elif isinstance(exc, OSError):
390 _LOG.error('{}: {}'.format(exc, self))
392 elif isinstance(exc, _SOCKET_ERRORS):
393 _LOG.error('{}: {}'.format(exc, self))
395 elif isinstance(exc, _feedparser.zlib.error):
396 _LOG.error('broken compression: {}'.format(self))
398 elif isinstance(exc, (IOError, AttributeError)):
399 _LOG.error('{}: {}'.format(exc, self))
401 elif isinstance(exc, KeyboardInterrupt):
403 elif isinstance(exc, _sax.SAXParseException):
404 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
406 elif parsed.bozo or exc:
408 exc = "can't process"
409 _LOG.error('processing error: {}: {}'.format(exc, self))
413 status in [200, 302] and
414 not parsed.entries and
416 raise _error.ProcessingError(parsed=parsed, feed=feed)
418 def _html2text(self, html, baseurl=''):
419 self.config.setup_html2text(section=self.section)
420 return _html2text.html2text(html=html, baseurl=baseurl)
    def _process_entry(self, parsed, entry):
        """Build (guid, id_, sender, message) for one feed entry.

        Returns None when the entry has already been seen.
        NOTE(review): several lines of this method are missing from this
        view (the `if processed` plumbing, empty-header removal body,
        the bonus-header try/except frame, and parts of the
        get_message() call); the body shown is partial.
        """
        id_ = self._get_entry_id(entry)
        # If .trust_guid isn't set, we get back hashes of the content.
        # Instead of letting these run wild, we put them in context
        # by associating them with the actual ID (if it exists).
        guid = entry.get('id', id_)
        if isinstance(guid, dict):
            # NOTE(review): dict.values() is not subscriptable on
            # Python 3 -- this needs next(iter(guid.values())).
            guid = guid.values()[0]
        if guid in self.seen:
            if self.seen[guid]['id'] == id_:
                _LOG.debug('already seen {}'.format(id_))
                return  # already seen
        sender = self._get_entry_email(parsed=parsed, entry=entry)
        subject = self._get_entry_title(entry)
        extra_headers = _collections.OrderedDict((
            ('Date', self._get_entry_date(entry)),
            ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
            ('User-Agent', _USER_AGENT),
            ('X-RSS-Feed', self.url),
            ('X-RSS-URL', self._get_entry_link(entry)),
            ('X-RSS-TAGS', self._get_entry_tags(entry)),
        for k,v in extra_headers.items():  # remove empty tags, etc.
        # user-configured extra headers, one "Key: value" per line
        if self.bonus_header:
            for header in self.bonus_header.splitlines():
                key,value = header.split(':', 1)
                extra_headers[key.strip()] = value.strip()
                'malformed bonus-header: {}'.format(
        content = self._get_entry_content(entry)
            content = self._process_entry_content(
                entry=entry, content=content, subject=subject)
        except _error.ProcessingError as e:
        message = _email.get_message(
            body=content['value'],
            content_type=content['type'].split('/', 1)[1],
            extra_headers=extra_headers,
            section=self.section)
        return (guid, id_, sender, message)
    def _get_entry_id(self, entry):
        """Get best ID from an entry.

        Prefers the entry's own id, then falls back to a SHA1 hash of
        the content, link, or title.
        NOTE(review): a few lines are missing from this view (the
        trust-guid guard, `return entry.id`, and the `if content_value:`
        guard), so the branch structure shown is partial.
        """
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # NOTE(review): dict.values() is not subscriptable on
                # Python 3 -- this needs next(iter(entry.id.values())).
                return entry.id.values()[0]
        content = self._get_entry_content(entry)
        content_value = content['value'].strip()
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'link', None):
            return _hashlib.sha1(
                entry.link.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'title', None):
            return _hashlib.sha1(
                entry.title.encode('unicode-escape')).hexdigest()
496 def _get_entry_link(self, entry):
497 return entry.get('link', None)
    def _get_entry_title(self, entry):
        """Return a single-line plain-text title for *entry*.

        Uses title_detail when present, otherwise derives a title from
        the entry content.
        NOTE(review): the else-branch header and the final
        `return title` are missing from this view.
        """
        if hasattr(entry, 'title_detail') and entry.title_detail:
            title = entry.title_detail.value
            if 'html' in entry.title_detail.type:
                title = self._html2text(title)
        content = self._get_entry_content(entry)
        value = content['value']
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            value = self._html2text(value)
        # headers must be single-line
        title = title.replace('\n', ' ').strip()
    def _get_entry_date(self, entry):
        """Return an RFC 2822 date string for *entry*.

        Walks self.date_header_order and takes a `*_parsed` date the
        entry provides, defaulting to the current UTC time.
        NOTE(review): a `break` after the first match appears to be
        missing from this view; as shown, the *last* matching date
        type wins -- confirm against the full file.
        """
        datetime = _time.gmtime()
        for datetype in self.date_header_order:
            kind = datetype + '_parsed'
            if entry.get(kind, None):
                datetime = entry[kind]
        return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
    def _get_entry_name(self, parsed, entry):
        """Get the best display name for the entry's author.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '    <name>Example author</name>\\n'
        ...     '    <email>me@example.com</email>\\n'
        ...     '    <url>http://example.com/</url>\\n'
        >>> entry = parsed.entries[0]
        >>> f.name_format = ''
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.name_format = '{author}'
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        >>> f.name_format = '{feed-title}: {author}'
        >>> f._get_entry_name(parsed, entry)
        >>> f.name_format = '{author} ({feed.name})'
        >>> f._get_entry_name(parsed, entry)
        'Example author (test-feed)'
        """
        if not self.name_format:
            # NOTE(review): the `return ''` for this branch is missing
            # from this view.
        data = {'feed': self}
        # NOTE(review): `feed = parsed.feed` -- the source of the local
        # `feed` used below -- is missing from this view.
        data['feed-title'] = feed.get('title', '')
        # prefer the entry author; fall back to the feed author
        for x in [entry, feed]:
            if 'name' in x.get('author_detail', []):
                if x.author_detail.name:
                    data['author'] = x.author_detail.name
        if 'name' in feed.get('publisher_detail', []):
            data['publisher'] = feed.publisher_detail.name
        name = self.name_format.format(**data)
        return _html2text.unescape(name)
    def _validate_email(self, email, default=None):
        """Do a basic quality check on email address

        Return `default` if the address doesn't appear to be
        well-formed.  If `default` is `None`, return
        `self.from_email` instead.

        >>> f = Feed(name='test-feed')
        >>> f._validate_email('valid@example.com', 'default@example.com')
        'valid@example.com'
        >>> f._validate_email('invalid@', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('@invalid', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('invalid', 'default@example.com')
        'default@example.com'
        """
        # well-formed enough = exactly one '@' with non-empty halves
        parts = email.split('@')
        if len(parts) != 2 or '' in parts:
            # NOTE(review): the `if default is None:` guard and the
            # `return default` / `return email` branches are missing
            # from this view.
            return self.from_email
    def _get_entry_address(self, parsed, entry):
        """Get the best From email address ('<jdoe@a.com>')

        If the best guess isn't well-formed (something@somthing.com),
        use `self.from_email` instead.
        """
        # NOTE(review): the `if self.force_from:` guard for this early
        # return and the `feed = parsed.feed` assignment are missing
        # from this view.
        return self.from_email
        # prefer the entry author's address, then the feed author's
        if 'email' in entry.get('author_detail', []):
            return self._validate_email(entry.author_detail.email)
        elif 'email' in feed.get('author_detail', []):
            return self._validate_email(feed.author_detail.email)
        if self.use_publisher_email:
            if 'email' in feed.get('publisher_detail', []):
                return self._validate_email(feed.publisher_detail.email)
            if feed.get('errorreportsto', None):
                return self._validate_email(feed.errorreportsto)
        _LOG.debug('no sender address found, fallback to default')
        return self.from_email
613 def _get_entry_email(self, parsed, entry):
614 """Get the best From email address ('John <jdoe@a.com>')
616 name = self._get_entry_name(parsed=parsed, entry=entry)
617 address = self._get_entry_address(parsed=parsed, entry=entry)
618 return _formataddr((name, address))
    def _get_entry_tags(self, entry):
        """Add post tags, if available

        >>> f = Feed(name='test-feed')
        >>> f._get_entry_tags({
        ...         'tags': [{'term': 'tag1',
        ...                   'label': None}]})
        >>> f._get_entry_tags({
        ...         'tags': [{'term': 'tag1',
        ...                   'label': None}]})

        Test some troublesome cases.  No tags:

        >>> f._get_entry_tags({})

        Empty tags:

        >>> f._get_entry_tags({'tags': []})

        Tags without a ``term`` entry:

        >>> f._get_entry_tags({
        ...         'tags': [{'scheme': None,
        ...                   'label': None}]})

        Tags with an empty term:

        >>> f._get_entry_tags({
        ...         'tags': [{'term': '',
        ...                   'label': None}]})
        """
        # keep only tags that actually carry a non-empty 'term'
        taglist = [tag['term'] for tag in entry.get('tags', [])
                   if tag.get('term', '')]
        # NOTE(review): an `if taglist:` guard line appears to be
        # missing from this view (the doctests above show no output --
        # i.e. None -- for the empty cases); confirm upstream.
        return ','.join(taglist)
    def _get_entry_content(self, entry):
        """Select the best content from an entry.

        Returns a feedparser content dict.
        """
        # How this works:
        # * We have a bunch of potential contents.
        # * We go thru looking for our first choice.
        #   (HTML or text, depending on self.html_mail)
        # * If that doesn't work, we go thru looking for our second choice.
        # * If that still doesn't work, we just take the first one.
        #
        # Possible future improvement:
        # * Instead of just taking the first one
        #   pick the one in the "best" language.
        # * HACK: hardcoded .html_mail, should take a tuple of media types
        contents = list(entry.get('content', []))
        if entry.get('summary_detail', None):
            contents.append(entry.summary_detail)
        # NOTE(review): the `if self.html_mail:` / `else:` lines that
        # choose between these two orderings are missing from this view.
        types = ['text/html', 'text/plain']
            types = ['text/plain', 'text/html']
        for content_type in types:
            for content in contents:
                if content['type'] == content_type:
                    # NOTE(review): the `return content` line is
                    # missing from this view.
        # nothing matched: empty plain-text fallback
        return {'type': 'text/plain', 'value': ''}
    def _process_entry_content(self, entry, content, subject):
        "Convert entry content to the requested format."
        # NOTE(review): many lines of this method are missing from this
        # view (the `if self.html_mail:` branch header, the HTML
        # skeleton/CSS lines, several `lines.append(` openers, guard
        # lines, and the final `return content`); the body shown is
        # partial.
        link = self._get_entry_link(entry)
        if self.use_css and self.css:
                '  <style type="text/css">',
            '<h1 class="header"><a href="{}">{}</a></h1>'.format(
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            lines.append(content['value'].strip())
            # plain text embedded in the HTML body must be escaped
            lines.append(_saxutils.escape(content['value'].strip()))
        lines.append('</div>')
            '<div class="footer">'
            '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
        for enclosure in getattr(entry, 'enclosures', []):
            if getattr(enclosure, 'url', None):
                    '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
            if getattr(enclosure, 'src', None):
                    '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
                    '<p><img src="{}" /></p>'.format(enclosure.src))
        for elink in getattr(entry, 'links', []):
            if elink.get('rel', None) == 'via':
                title = elink.get('title', url)
                lines.append('<p>Via <a href="{}">{}</a></p>'.format(
        content['type'] = 'text/html'
        content['value'] = '\n'.join(lines)
        else:  # not self.html_mail
            if content['type'] in ('text/html', 'application/xhtml+xml'):
                    lines = [self._html2text(content['value'])]
                # NOTE(review): html.parser.HTMLParseError was removed
                # in Python 3.5, so evaluating this except clause raises
                # AttributeError on modern Pythons; a broader exception
                # type is needed here -- confirm against upstream.
                except _html_parser.HTMLParseError as e:
                    raise _error.ProcessingError(parsed=None, feed=self)
                lines = [content['value']]
            lines.append('URL: {}'.format(link))
            for enclosure in getattr(entry, 'enclosures', []):
                if getattr(enclosure, 'url', None):
                    lines.append('Enclosure: {}'.format(enclosure.url))
                if getattr(enclosure, 'src', None):
                    lines.append('Enclosure: {}'.format(enclosure.src))
            for elink in getattr(entry, 'links', []):
                if elink.get('rel', None) == 'via':
                    title = elink.get('title', url)
                    lines.append('Via: {} {}'.format(title, url))
            content['type'] = 'text/plain'
            content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Deliver *message* to self.to using the rss2email email helpers."""
        _LOG.info('send message for {}'.format(self))
        section = self.section
        if section not in self.config:
            # NOTE(review): the `section = 'DEFAULT'` fallback line is
            # missing from this view.
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
    def run(self, send=True):
        """Fetch and process the feed, mailing entry emails.

        >>> feed = Feed(
        ...    name='test-feed',
        ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> def send(sender, message):
        ...    print('send from {}:'.format(sender))
        ...    print(message.as_string())
        >>> feed._send = send
        >>> feed.to = 'jdoe@dummy.invalid'
        >>> #parsed = feed.run()  # enable for debugging
        """
        # NOTE(review): several lines of this method are missing from
        # this view (the `if not self.to:` guard, `if self.digest:`
        # headers, `seen = []`, `elif send:`, `self.seen[guid] = {}`,
        # and the digest_post_process early-return); the body shown is
        # partial.
        raise _error.NoToEmailAddress(feed=self)
        parsed = self._fetch()
            digest = self._new_digest()
        for (guid, id_, sender, message) in self._process(parsed):
            _LOG.debug('new message: {}'.format(message['Subject']))
                seen.append((guid, id_))
                self._append_to_digest(digest=digest, message=message)
                self._send(sender=sender, message=message)
            if guid not in self.seen:
                self.seen[guid]['id'] = id_
        if self.digest and seen:
            # optional user hook: may rewrite or drop the digest
            if self.digest_post_process:
                digest = self.digest_post_process(
                    feed=self, parsed=parsed, seen=seen, message=digest)
                digest=digest, seen=seen, sender=sender, send=send)
        # remember cache validators for the next fetch
        self.etag = parsed.get('etag', None)
        self.modified = parsed.get('modified', None)
    def _new_digest(self):
        """Create an empty multipart/digest container for this feed.

        NOTE(review): the trailing `return digest` is missing from this
        view of the file.
        """
        digest = _MIMEMultipart('digest')
        digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
        digest['Subject'] = 'digest for {}'.format(self.name)
        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
        digest['User-Agent'] = _USER_AGENT
        digest['X-RSS-Feed'] = self.url
    def _append_to_digest(self, digest, message):
        """Attach *message* to *digest* as a message/rfc822 part.

        NOTE(review): the `digest.attach(part)` call is missing from
        this view of the file.
        """
        part = _MIMEMessage(message)
        part.add_header('Content-Disposition', 'attachment')
    def _send_digest(self, digest, seen, sender, send=True):
        """Send a digest message

        The date is extracted from the last message in the digest
        payload.  We assume that this part exists.  If you don't have
        any messages in the digest, don't call this function.
        """
        digest['From'] = sender  # TODO: _Header(), _formataddr()...
        # date the digest after its newest attached message
        last_part = digest.get_payload()[-1]
        last_message = last_part.get_payload()[0]
        digest['Date'] = last_message['Date']

        _LOG.debug('new digest for {}'.format(self))
        # NOTE(review): the `if send:` guard for this call is missing
        # from this view.
        self._send(sender=sender, message=digest)
        for (guid, id_) in seen:
            if guid not in self.seen:
                # NOTE(review): the `self.seen[guid] = {}` line is
                # missing from this view.
                self.seen[guid]['id'] = id_