1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.mime.message import MIMEMessage as _MIMEMessage
32 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
33 from email.utils import formataddr as _formataddr
34 import hashlib as _hashlib
35 import html.parser as _html_parser
37 import socket as _socket
39 import urllib.error as _urllib_error
40 import urllib.request as _urllib_request
42 import xml.sax as _sax
43 import xml.sax.saxutils as _saxutils
45 import feedparser as _feedparser
46 import html2text as _html2text
49 from . import __version__
50 from . import LOG as _LOG
51 from . import config as _config
52 from . import email as _email
53 from . import error as _error
54 from . import util as _util
# User-Agent advertised on feed fetches; shared between feedparser and
# urllib so servers see one consistent client string.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket exception classes available on this platform into a
# tuple usable with isinstance().  The list must be initialized before
# the loop appends to it (the visible code appended to an undefined name).
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
69 """Utility class for feed manipulation and storage.
73 >>> from .config import CONFIG
76 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
78 test-feed (http://example.com/feed.atom -> a@b.com)
82 'user@rss2email.invalid'
84 >>> feed.from_email = 'a@b.com'
85 >>> feed.save_to_config()
86 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
88 from = user@rss2email.invalid
93 url = http://example.com/feed.atom
98 >>> feed.etag = 'dummy etag'
99 >>> string = pickle.dumps(feed)
100 >>> feed = pickle.loads(string)
101 >>> feed.load_from_config(config=CONFIG)
105 'http://example.com/feed.atom'
107 Names can only contain ASCII letters, digits, and '._-'. Here the
108 invalid space causes an exception:
110 >>> Feed(name='invalid name')
111 Traceback (most recent call last):
113 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
115 You must define a URL:
117 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
118 Traceback (most recent call last):
120 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
125 >>> CONFIG['DEFAULT']['to'] = ''
126 >>> test_section = CONFIG.pop('feed.test-feed')
# Valid feed names: ASCII letters, digits, '.', '_', '-' (enforced by
# _set_name; see the class docstring example above).
128 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
130 # saved/loaded from feed.dat using __getstate__/__setstate__.
131 _dynamic_attributes = [
138 ## saved/loaded from ConfigParser instance
139 # attributes that aren't in DEFAULT
140 _non_default_configured_attributes = [
143 # attributes that are in DEFAULT
# Config option names use '-', Python attributes use '_'; translate here.
144 _default_configured_attributes = [
145 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
146 _default_configured_attributes[
147 _default_configured_attributes.index('from')
148 ] = 'from_email' # `from` is a Python keyword
149 # all attributes that are saved/loaded from .config
150 _configured_attributes = (
151 _non_default_configured_attributes + _default_configured_attributes)
152 # attribute name -> .config option
153 _configured_attribute_translations = dict(
154 (attr,attr) for attr in _non_default_configured_attributes)
155 _configured_attribute_translations.update(dict(
156 zip(_default_configured_attributes,
157 _config.CONFIG['DEFAULT'].keys())))
158 # .config option -> attribute name
159 _configured_attribute_inverse_translations = dict(
160 (v,k) for k,v in _configured_attribute_translations.items())
162 # hints for value conversion
# These attribute lists tell the config (de)serializers which converter
# (getboolean / getint / list split / function import) to apply.
163 _boolean_attributes = [
166 'use_publisher_email',
174 'links_after_each_paragraph',
179 _integer_attributes = [
189 _function_attributes = [
191 'digest_post_process',
# Initialize the feed: validate the name, seed dynamic state, then
# overlay any values from the ConfigParser instance.
194 def __init__(self, name=None, url=None, to=None, config=None):
195 self._set_name(name=name)
# Round-trip the freshly-reset dynamic attributes through __setstate__
# so construction and unpickling share one code path.
197 self.__setstate__(dict(
198 (attr, getattr(self, attr))
199 for attr in self._dynamic_attributes))
200 self.load_from_config(config=config)
207 return '{} ({} -> {})'.format(self.name, self.url, self.to)
210 return '<Feed {}>'.format(str(self))
212 def __getstate__(self):
213 "Save dynamic attributes"
215 (key,getattr(self,key)) for key in self._dynamic_attributes)
217 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic attributes from a pickled state mapping."""
    # Reject any state whose key set does not exactly match the
    # declared dynamic attributes.
    if sorted(state.keys()) != sorted(self._dynamic_attributes):
        raise ValueError(state)
    self._set_name(name=state['name'])
    self.__dict__.update(state)

set_state = __setstate__  # make it publicly accessible
# Serialize configured attributes into this feed's config section.
229 def save_to_config(self):
230 "Save configured attributes"
231 data = _collections.OrderedDict()
232 default = self.config['DEFAULT']
233 for attr in self._configured_attributes:
234 key = self._configured_attribute_translations[attr]
235 value = getattr(self, attr)
236 if value is not None:
237 value = self._get_configured_option_value(
238 attribute=attr, value=value)
# Only persist values that differ from DEFAULT; non-default
# attributes are always written.
239 if (attr in self._non_default_configured_attributes or
240 value != default[key]):
# NOTE(review): the store into data[key] is not visible in this view —
# confirm the branch body against the full source.
242 self.config[self.section] = data
# Populate attributes from the feed's config section (or DEFAULT),
# validating that the section has exactly the expected keys.
244 def load_from_config(self, config=None):
245 "Restore configured attributes"
247 config = _config.CONFIG
249 if self.section in self.config:
250 data = self.config[self.section]
252 data = self.config['DEFAULT']
253 keys = sorted(data.keys())
254 expected = sorted(self._configured_attribute_translations.values())
# Missing keys are only an error for DEFAULT-backed options; extra
# keys are always an error.
257 if (key not in keys and
258 key not in self._non_default_configured_attributes):
259 raise _error.InvalidFeedConfig(
260 setting=key, feed=self,
261 message='missing configuration key: {}'.format(key))
263 if key not in expected:
264 raise _error.InvalidFeedConfig(
265 setting=key, feed=self,
266 message='extra configuration key: {}'.format(key))
# Convert config option names/values back to attribute names/values.
268 (self._configured_attribute_inverse_translations[k],
269 self._get_configured_attribute_value(
270 attribute=self._configured_attribute_inverse_translations[k],
272 for k in data.keys())
273 for attr in self._non_default_configured_attributes:
276 self.__dict__.update(data)
# Convert an attribute value into its config-file string form.
278 def _get_configured_option_value(self, attribute, value):
281 elif attribute in self._list_attributes:
282 return ', '.join(value)
# NOTE(review): serializing a function attribute via import_name looks
# asymmetric with import_function below — confirm against full source.
283 elif attribute in self._function_attributes:
284 return _util.import_name(value)
# Convert a config-file string back into a typed attribute value,
# using the per-category converters declared on the class.
287 def _get_configured_attribute_value(self, attribute, key, data):
288 if attribute in self._boolean_attributes:
289 return data.getboolean(key)
290 elif attribute in self._integer_attributes:
291 return data.getint(key)
292 elif attribute in self._list_attributes:
293 return [x.strip() for x in data[key].split(',')]
294 elif attribute in self._function_attributes:
296 return _util.import_function(data[key])
301 """Reset dynamic data
# Validate and store the feed name, and derive the config section name.
307 def _set_name(self, name):
308 if not self._name_regexp.match(name):
309 raise _error.InvalidFeedName(name=name, feed=self)
311 self.section = 'feed.{}'.format(self.name)
314 """Fetch and parse a feed using feedparser.
317 ... name='test-feed',
318 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
319 >>> parsed = feed._fetch()
323 _LOG.info('fetch {}'.format(self))
325 raise _error.InvalidFeedConfig(setting='url', feed=self)
326 if self.section in self.config:
327 config = self.config[self.section]
329 config = self.config['DEFAULT']
330 proxy = config['proxy']
331 timeout = config.getint('feed-timeout')
# Route the fetch through a proxy when one is configured.
334 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
# Wrap feedparser.parse so a hung server cannot block forever; pass the
# cached etag/modified values for conditional (304) fetches.
335 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
336 return f(self.url, self.etag, modified=self.modified, **kwargs)
# Generator: validate the parsed feed, then yield one
# (guid, id, sender, message) tuple per new entry, oldest first.
338 def _process(self, parsed):
339 _LOG.info('process {}'.format(self))
340 self._check_for_errors(parsed)
# reversed() so entries are emitted in chronological order.
341 for entry in reversed(parsed.entries):
342 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
343 processed = self._process_entry(parsed=parsed, entry=entry)
345 guid,id_,sender,message = processed
# Optional user hook may rewrite (or drop) the message.
346 if self.post_process:
347 message = self.post_process(
348 feed=self, parsed=parsed, entry=entry, guid=guid,
352 yield (guid, id_, sender, message)
# Inspect a feedparser result for HTTP-level and parse-level problems,
# logging warnings and raising rss2email errors for fatal cases.
354 def _check_for_errors(self, parsed):
356 status = getattr(parsed, 'status', 200)
357 _LOG.debug('HTTP status {}'.format(status))
# Permanent redirects update the stored feed URL in place.
359 _LOG.info('redirect {} from {} to {}'.format(
360 self.name, self.url, parsed['url']))
361 self.url = parsed['url']
362 elif status not in [200, 302, 304]:
363 raise _error.HTTPError(status=status, feed=self)
365 http_headers = parsed.get('headers', {})
367 _LOG.debug('HTTP headers: {}'.format(http_headers))
369 _LOG.warning('could not get HTTP headers: {}'.format(self))
372 if 'html' in http_headers.get('content-type', 'rss'):
373 _LOG.warning('looks like HTML: {}'.format(self))
375 if http_headers.get('content-length', '1') == '0':
376 _LOG.warning('empty page: {}'.format(self))
379 version = parsed.get('version', None)
381 _LOG.debug('feed version {}'.format(version))
383 _LOG.warning('unrecognized version: {}'.format(self))
# Classify feedparser's bozo_exception to give targeted log messages.
386 exc = parsed.get('bozo_exception', None)
387 if isinstance(exc, _socket.timeout):
388 _LOG.error('timed out: {}'.format(self))
# NOTE(review): on Python 3, socket.error and friends subclass OSError,
# so this branch shadows the _SOCKET_ERRORS branch below — confirm the
# intended ordering.
390 elif isinstance(exc, OSError):
391 _LOG.error('{}: {}'.format(exc, self))
393 elif isinstance(exc, _SOCKET_ERRORS):
394 _LOG.error('{}: {}'.format(exc, self))
396 elif isinstance(exc, _feedparser.zlib.error):
397 _LOG.error('broken compression: {}'.format(self))
399 elif isinstance(exc, (IOError, AttributeError)):
400 _LOG.error('{}: {}'.format(exc, self))
402 elif isinstance(exc, KeyboardInterrupt):
404 elif isinstance(exc, _sax.SAXParseException):
405 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
407 elif parsed.bozo or exc:
409 exc = "can't process"
410 _LOG.error('processing error: {}: {}'.format(exc, self))
414 status in [200, 302] and
415 not parsed.entries and
# NOTE(review): 'feed' is undefined in this scope — this almost
# certainly should be feed=self; raising here would NameError.
417 raise _error.ProcessingError(parsed=parsed, feed=feed)
# Convert HTML to markdown-ish text via html2text, configured from this
# feed's config section; fall back to `default` on parse failure.
419 def _html2text(self, html, baseurl='', default=None):
420 self.config.setup_html2text(section=self.section)
422 return _html2text.html2text(html=html, baseurl=baseurl)
# NOTE(review): html.parser.HTMLParseError was removed in Python 3.5 —
# this except clause would itself raise AttributeError there; verify
# the targeted Python version.
423 except _html_parser.HTMLParseError as e:
424 if default is not None:
# Build the (guid, id, sender, email.Message) tuple for one entry, or
# return None when the entry has already been seen.
428 def _process_entry(self, parsed, entry):
429 id_ = self._get_entry_id(entry)
430 # If .trust_guid isn't set, we get back hashes of the content.
431 # Instead of letting these run wild, we put them in context
432 # by associating them with the actual ID (if it exists).
433 guid = entry.get('id', id_)
434 if isinstance(guid, dict):
# NOTE(review): dict.values() is not indexable on Python 3 — this line
# would raise TypeError; likely needs next(iter(guid.values())).
435 guid = guid.values()[0]
436 if guid in self.seen:
437 if self.seen[guid]['id'] == id_:
438 _LOG.debug('already seen {}'.format(id_))
439 return # already seen
440 sender = self._get_entry_email(parsed=parsed, entry=entry)
441 subject = self._get_entry_title(entry)
# Assemble the standard per-entry headers.
442 extra_headers = _collections.OrderedDict((
443 ('Date', self._get_entry_date(entry)),
444 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
445 ('User-Agent', _USER_AGENT),
446 ('X-RSS-Feed', self.url),
448 ('X-RSS-URL', self._get_entry_link(entry)),
449 ('X-RSS-TAGS', self._get_entry_tags(entry)),
451 for k,v in extra_headers.items(): # remove empty tags, etc.
# User-supplied bonus headers, one 'Key: value' per line.
454 if self.bonus_header:
455 for header in self.bonus_header.splitlines():
457 key,value = header.split(':', 1)
458 extra_headers[key.strip()] = value.strip()
461 'malformed bonus-header: {}'.format(
464 content = self._get_entry_content(entry)
466 content = self._process_entry_content(
467 entry=entry, content=content, subject=subject)
468 except _error.ProcessingError as e:
471 message = _email.get_message(
475 body=content['value'],
# content['type'] is e.g. 'text/html'; get_message wants the subtype.
476 content_type=content['type'].split('/', 1)[1],
477 extra_headers=extra_headers,
479 section=self.section)
480 return (guid, id_, sender, message)
def _get_entry_id(self, entry):
    """Get the best ID from an entry.

    Prefer the feed-supplied ID when ``self.trust_guid`` is set;
    otherwise fall back to a SHA1 hash of the entry's content, link,
    or title (in that order).  Returns None when nothing usable is
    present.
    """
    if self.trust_guid:
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # dict views are not indexable on Python 3 (the old
                # .values()[0] raised TypeError); take the first value
                # explicitly.
                return next(iter(entry.id.values()))
            return entry.id
    content = self._get_entry_content(entry)
    content_value = content['value'].strip()
    if content_value:
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'link', None):
        return _hashlib.sha1(
            entry.link.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'title', None):
        return _hashlib.sha1(
            entry.title.encode('unicode-escape')).hexdigest()
502 def _get_entry_link(self, entry):
503 return entry.get('link', None)
# Derive a plain-text subject line: prefer the entry title (converted
# from HTML when needed), falling back to the entry content.
505 def _get_entry_title(self, entry):
506 if hasattr(entry, 'title_detail') and entry.title_detail:
507 title = entry.title_detail.value
508 if 'html' in entry.title_detail.type:
509 title = self._html2text(title, default=title)
511 content = self._get_entry_content(entry)
512 value = content['value']
513 if content['type'] in ('text/html', 'application/xhtml+xml'):
514 value = self._html2text(value, default=value)
# Subjects must be a single line.
516 title = title.replace('\n', ' ').strip()
# Format the entry's date as an RFC 2822 Date header (UTC), trying the
# configured date fields in order and defaulting to "now".
519 def _get_entry_date(self, entry):
520 datetime = _time.gmtime()
522 for datetype in self.date_header_order:
523 kind = datetype + '_parsed'
524 if entry.get(kind, None):
525 datetime = entry[kind]
527 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
# Build the human-readable sender name shown in the From header, from
# the feed title plus the entry/feed author (when friendly_name is set).
529 def _get_entry_name(self, parsed, entry):
532 >>> import feedparser
533 >>> f = Feed(name='test-feed')
534 >>> parsed = feedparser.parse(
535 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
538 ... ' <name>Example author</name>\\n'
539 ... ' <email>me@example.com</email>\\n'
540 ... ' <url>http://example.com/</url>\\n'
545 >>> entry = parsed.entries[0]
546 >>> f.friendly_name = False
547 >>> f._get_entry_name(parsed, entry)
549 >>> f.friendly_name = True
550 >>> f._get_entry_name(parsed, entry)
553 if not self.friendly_name:
557 parts.append(feed.get('title', ''))
# Entry-level author wins over feed-level author.
558 for x in [entry, feed]:
559 if 'name' in x.get('author_detail', []):
560 if x.author_detail.name:
563 parts.append(x.author_detail.name)
# Last resort: publisher name, only when use_publisher_email is set.
565 if not ''.join(parts) and self.use_publisher_email:
566 if 'name' in feed.get('publisher_detail', []):
569 parts.append(feed.publisher_detail.name)
570 return _html2text.unescape(''.join(parts))
# Sanity-check an email address: exactly one '@' with non-empty local
# part and domain.
572 def _validate_email(self, email, default=None):
573 """Do a basic quality check on email address
575 Return `default` if the address doesn't appear to be
576 well-formed. If `default` is `None`, return
579 >>> f = Feed(name='test-feed')
580 >>> f._validate_email('valid@example.com', 'default@example.com')
582 >>> f._validate_email('invalid@', 'default@example.com')
583 'default@example.com'
584 >>> f._validate_email('@invalid', 'default@example.com')
585 'default@example.com'
586 >>> f._validate_email('invalid', 'default@example.com')
587 'default@example.com'
589 parts = email.split('@')
590 if len(parts) != 2 or '' in parts:
# Fall back to the feed's configured From address.
592 return self.from_email
# Pick the best bare From address, trying entry author, feed author,
# then (optionally) publisher / errorreportsto, each validated.
596 def _get_entry_address(self, parsed, entry):
597 """Get the best From email address ('<jdoe@a.com>')
599 If the best guess isn't well-formed (something@something.com),
600 use `self.from_email` instead.
603 return self.from_email
605 if 'email' in entry.get('author_detail', []):
606 return self._validate_email(entry.author_detail.email)
607 elif 'email' in feed.get('author_detail', []):
608 return self._validate_email(feed.author_detail.email)
609 if self.use_publisher_email:
610 if 'email' in feed.get('publisher_detail', []):
611 return self._validate_email(feed.publisher_detail.email)
612 if feed.get('errorreportsto', None):
613 return self._validate_email(feed.errorreportsto)
614 _LOG.debug('no sender address found, fallback to default')
615 return self.from_email
# Combine the friendly name and address into an RFC 2822 From value.
617 def _get_entry_email(self, parsed, entry):
618 """Get the best From email address ('John <jdoe@a.com>')
620 name = self._get_entry_name(parsed=parsed, entry=entry)
621 address = self._get_entry_address(parsed=parsed, entry=entry)
622 return _formataddr((name, address))
# Produce the comma-joined tag list for the X-RSS-TAGS header, skipping
# tags with a missing or empty 'term'.
624 def _get_entry_tags(self, entry):
625 """Add post tags, if available
627 >>> f = Feed(name='test-feed')
628 >>> f._get_entry_tags({
629 ... 'tags': [{'term': 'tag1',
631 ... 'label': None}]})
633 >>> f._get_entry_tags({
634 ... 'tags': [{'term': 'tag1',
639 ... 'label': None}]})
642 Test some troublesome cases. No tags:
644 >>> f._get_entry_tags({})
648 >>> f._get_entry_tags({'tags': []})
650 Tags without a ``term`` entry:
652 >>> f._get_entry_tags({
653 ... 'tags': [{'scheme': None,
654 ... 'label': None}]})
656 Tags with an empty term:
658 >>> f._get_entry_tags({
659 ... 'tags': [{'term': '',
661 ... 'label': None}]})
663 taglist = [tag['term'] for tag in entry.get('tags', [])
664 if tag.get('term', '')]
666 return ','.join(taglist)
# Choose the most suitable content block from an entry, preferring the
# media type implied by self.html_mail.
668 def _get_entry_content(self, entry):
669 """Select the best content from an entry.
671 Returns a feedparser content dict.
674 # * We have a bunch of potential contents.
675 # * We go thru looking for our first choice.
676 # (HTML or text, depending on self.html_mail)
677 # * If that doesn't work, we go thru looking for our second choice.
678 # * If that still doesn't work, we just take the first one.
680 # Possible future improvement:
681 # * Instead of just taking the first one
682 # pick the one in the "best" language.
683 # * HACK: hardcoded .html_mail, should take a tuple of media types
684 contents = list(entry.get('content', []))
# The summary counts as a candidate alongside full content blocks.
685 if entry.get('summary_detail', None):
686 contents.append(entry.summary_detail)
688 types = ['text/html', 'text/plain']
690 types = ['text/plain', 'text/html']
691 for content_type in types:
692 for content in contents:
693 if content['type'] == content_type:
# Nothing matched at all: return an empty plain-text stand-in.
697 return {'type': 'text/plain', 'value': ''}
# Render the selected content as either an HTML document (with header,
# footer, enclosures, and optional CSS) or plain text, per html_mail.
699 def _process_entry_content(self, entry, content, subject):
700 "Convert entry content to the requested format."
701 link = self._get_entry_link(entry)
708 if self.use_css and self.css:
710 ' <style type="text/css">',
718 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
# Already-HTML content is embedded as-is; plain text is escaped.
722 if content['type'] in ('text/html', 'application/xhtml+xml'):
723 lines.append(content['value'].strip())
725 lines.append(_saxutils.escape(content['value'].strip()))
726 lines.append('</div>')
728 '<div class="footer">'
729 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
# Footer: enclosure links/images and 'via' links.
731 for enclosure in getattr(entry, 'enclosures', []):
732 if getattr(enclosure, 'url', None):
734 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
736 if getattr(enclosure, 'src', None):
738 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
741 '<p><img src="{}" /></p>'.format(enclosure.src))
742 for elink in getattr(entry, 'links', []):
743 if elink.get('rel', None) == 'via':
745 title = elink.get('title', url)
746 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
754 content['type'] = 'text/html'
755 content['value'] = '\n'.join(lines)
757 else: # not self.html_mail
# Plain-text mail: convert HTML content down to text.
758 if content['type'] in ('text/html', 'application/xhtml+xml'):
760 lines = [self._html2text(content['value'])]
# NOTE(review): HTMLParseError was removed in Python 3.5 — see the
# same concern in _html2text above.
761 except _html_parser.HTMLParseError as e:
762 raise _error.ProcessingError(parsed=None, feed=self)
764 lines = [content['value']]
766 lines.append('URL: {}'.format(link))
767 for enclosure in getattr(entry, 'enclosures', []):
768 if getattr(enclosure, 'url', None):
769 lines.append('Enclosure: {}'.format(enclosure.url))
770 if getattr(enclosure, 'src', None):
771 lines.append('Enclosure: {}'.format(enclosure.src))
772 for elink in getattr(entry, 'links', []):
773 if elink.get('rel', None) == 'via':
775 title = elink.get('title', url)
776 lines.append('Via: {} {}'.format(title, url))
777 content['type'] = 'text/plain'
778 content['value'] = '\n'.join(lines)
# Hand one finished message to the email backend, using this feed's
# config section when it exists.
781 def _send(self, sender, message):
782 _LOG.info('send message for {}'.format(self))
783 section = self.section
784 if section not in self.config:
786 _email.send(sender=sender, recipient=self.to, message=message,
787 config=self.config, section=section)
# Top-level entry point: fetch, process, and (optionally) send each new
# entry (or a single digest), then record what was seen plus the HTTP
# cache validators.
789 def run(self, send=True):
790 """Fetch and process the feed, mailing entry emails.
793 ... name='test-feed',
794 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
795 >>> def send(sender, message):
796 ... print('send from {}:'.format(sender))
797 ... print(message.as_string())
798 >>> feed._send = send
799 >>> feed.to = 'jdoe@dummy.invalid'
800 >>> #parsed = feed.run() # enable for debugging
803 raise _error.NoToEmailAddress(feed=self)
804 parsed = self._fetch()
807 digest = self._new_digest()
810 for (guid, id_, sender, message) in self._process(parsed):
811 _LOG.debug('new message: {}'.format(message['Subject']))
# In digest mode messages are accumulated; otherwise sent one by one.
813 seen.append((guid, id_))
814 self._append_to_digest(digest=digest, message=message)
817 self._send(sender=sender, message=message)
818 if guid not in self.seen:
820 self.seen[guid]['id'] = id_
822 if self.digest and seen:
# Optional user hook may rewrite (or drop) the digest.
823 if self.digest_post_process:
824 digest = self.digest_post_process(
825 feed=self, parsed=parsed, seen=seen, message=digest)
829 digest=digest, seen=seen, sender=sender, send=send)
# Remember etag/modified so the next fetch can be conditional.
831 self.etag = parsed.get('etag', None)
832 self.modified = parsed.get('modified', None)
# Create an empty multipart/digest container with the feed's headers.
834 def _new_digest(self):
835 digest = _MIMEMultipart('digest')
836 digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
837 digest['Subject'] = 'digest for {}'.format(self.name)
838 digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
839 digest['User-Agent'] = _USER_AGENT
840 digest['X-RSS-Feed'] = self.url
# Wrap one entry message as an attached message/rfc822 digest part.
843 def _append_to_digest(self, digest, message):
844 part = _MIMEMessage(message)
845 part.add_header('Content-Disposition', 'attachment')
848 def _send_digest(self, digest, seen, sender, send=True):
849 """Send a digest message
851 The date is extracted from the last message in the digest
852 payload. We assume that this part exists. If you don't have
853 any messages in the digest, don't call this function.
855 digest['From'] = sender # TODO: _Header(), _formataddr()...
# Use the newest (last) entry's Date for the digest itself.
856 last_part = digest.get_payload()[-1]
857 last_message = last_part.get_payload()[0]
858 digest['Date'] = last_message['Date']
860 _LOG.debug('new digest for {}'.format(self))
862 self._send(sender=sender, message=digest)
# Record every digested entry as seen.
863 for (guid, id_) in seen:
864 if guid not in self.seen:
866 self.seen[guid]['id'] = id_