rss2email/feed.py

   1 # Copyright (C) 2004-2013 Aaron Swartz
   2 #                         Brian Lalor
   3 #                         Dean Jackson
   4 #                         Erik Hetzner
   5 #                         Etienne Millon <me@emillon.org>
   6 #                         Joey Hess
   7 #                         Lindsey Smith <lindsey.smith@gmail.com>
   8 #                         Marcel Ackermann
   9 #                         Martin 'Joey' Schulze
  10 #                         Matej Cepl
  11 #                         W. Trevor King <wking@tremily.us>
  12 #
  13 # This file is part of rss2email.
  14 #
  15 # rss2email is free software: you can redistribute it and/or modify it under
  16 # the terms of the GNU General Public License as published by the Free Software
  17 # Foundation, either version 2 of the License, or (at your option) version 3 of
  18 # the License.
  19 #
  20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
  21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  22 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  23 #
  24 # You should have received a copy of the GNU General Public License along with
  25 # rss2email.  If not, see <http://www.gnu.org/licenses/>.
  26
  27 """Define the ``Feed`` class for handling a single feed
  28 """
  29
  30 import collections as _collections
  31 from email.utils import formataddr as _formataddr
  32 import hashlib as _hashlib
  33 import re as _re
  34 import socket as _socket
  35 import time as _time
  36 import urllib.error as _urllib_error
  37 import urllib.request as _urllib_request
  38 import uuid as _uuid
  39 import xml.sax as _sax
  40 import xml.sax.saxutils as _saxutils
  41
  42 import feedparser as _feedparser
  43 import html2text as _html2text
  44
  45 from . import __url__
  46 from . import __version__
  47 from . import LOG as _LOG
  48 from . import config as _config
  49 from . import email as _email
  50 from . import error as _error
  51 from . import util as _util
  52
  53
  54 _feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
  55 _urllib_request.install_opener(_urllib_request.build_opener())
  56 _SOCKET_ERRORS = []
  57 for e in ['error', 'gaierror']:
  58     if hasattr(_socket, e):
  59         _SOCKET_ERRORS.append(getattr(_socket, e))
  60 del e  # cleanup namespace
  61 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
  62
  63
  64 class Feed (object):
  65     """Utility class for feed manipulation and storage.
  66
  67     >>> import pickle
  68     >>> import sys
  69     >>> from .config import CONFIG
  70
  71     >>> feed = Feed(
  72     ...    name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
  73     >>> print(feed)
  74     test-feed (http://example.com/feed.atom -> a@b.com)
  75     >>> feed.section
  76     'feed.test-feed'
  77     >>> feed.from_email
  78     'user@rss2email.invalid'
  79
  80     >>> feed.from_email = 'a@b.com'
  81     >>> feed.save_to_config()
  82     >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
  83     [DEFAULT]
  84     from = user@rss2email.invalid
  85     ...
  86     verbose = warning
  87     <BLANKLINE>
  88     [feed.test-feed]
  89     url = http://example.com/feed.atom
  90     from = a@b.com
  91     to = a@b.com
  92     <BLANKLINE>
  93
  94     >>> feed.etag = 'dummy etag'
  95     >>> string = pickle.dumps(feed)
  96     >>> feed = pickle.loads(string)
  97     >>> feed.load_from_config(config=CONFIG)
  98     >>> feed.etag
  99     'dummy etag'
 100     >>> feed.url
 101     'http://example.com/feed.atom'
 102
 103     Names can only contain ASCII letters, digits, and '._-'.  Here the
 104     invalid space causes an exception:
 105
 106     >>> Feed(name='invalid name')
 107     Traceback (most recent call last):
 108       ...
 109     rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
 110
 111     You must define a URL:
 112
 113     >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
 114     Traceback (most recent call last):
 115       ...
 116     rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
 117
 118
 119     Cleanup `CONFIG`.
 120
 121     >>> CONFIG['DEFAULT']['to'] = ''
 122     >>> test_section = CONFIG.pop('feed.test-feed')
 123     """
    # Feed names must consist solely of ASCII letters, digits, and '._-'
    # (checked by `_set_name`).
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')

    # saved/loaded from feed.dat using __getstate__/__setstate__.
    # `__setstate__` insists that a state dict has exactly these keys.
    _dynamic_attributes = [
        'name',
        'etag',
        'modified',
        'seen',
        ]

    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    # (`save_to_config` always writes these when they are not None, and
    # `load_from_config` sets them to None when they are absent)
    _non_default_configured_attributes = [
        'url',
        ]
    # attributes that are in DEFAULT
    # (one per DEFAULT option, with '-' replaced by '_' so the option
    # name is usable as a Python attribute name)
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email'  # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    # (non-default attributes map to themselves; default attributes are
    # zipped with the DEFAULT keys they were derived from, so the two
    # sequences must stay in the same order)
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
            zip(_default_configured_attributes,
                _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())

    # hints for value conversion
    # (consulted by `_get_configured_attribute_value` when loading and by
    # `_get_configured_option_value` when saving)

    # read from the config with `getboolean`
    _boolean_attributes = [
        'force_from',
        'use_publisher_email',
        'friendly_name',
        'active',
        'date_header',
        'trust_guid',
        'html_mail',
        'use_css',
        'unicode_snob',
        'links_after_each_paragraph',
        'use_smtp',
        'smtp_ssl',
        ]

    # read from the config with `getint`
    _integer_attributes = [
        'feed_timeout',
        'body_width',
        ]

    # stored as comma-separated strings, exposed as lists of stripped items
    _list_attributes = [
        'date_header_order',
        'encodings',
        ]

    # stored as importable names, resolved with `_util.import_function`
    # when loading (an empty option loads as None)
    _function_attributes = [
        'post_process',
        ]
 187
 188     def __init__(self, name=None, url=None, to=None, config=None):
 189         self._set_name(name=name)
 190         self.reset()
 191         self.__setstate__(dict(
 192                 (attr, getattr(self, attr))
 193                 for attr in self._dynamic_attributes))
 194         self.load_from_config(config=config)
 195         if url:
 196             self.url = url
 197         if to:
 198             self.to = to
 199
 200     def __str__(self):
 201         return '{} ({} -> {})'.format(self.name, self.url, self.to)
 202
 203     def __repr__(self):
 204         return '<Feed {}>'.format(str(self))
 205
 206     def __getstate__(self):
 207         "Save dyamic attributes"
 208         return dict(
 209             (key,getattr(self,key)) for key in self._dynamic_attributes)
 210
 211     get_state = __getstate__  # make it publicly accessible
 212
 213     def __setstate__(self, state):
 214         "Restore dynamic attributes"
 215         keys = sorted(state.keys())
 216         if keys != sorted(self._dynamic_attributes):
 217             raise ValueError(state)
 218         self._set_name(name=state['name'])
 219         self.__dict__.update(state)
 220
 221     set_state = __setstate__  # make it publicly accessible
 222
 223     def save_to_config(self):
 224         "Save configured attributes"
 225         data = _collections.OrderedDict()
 226         default = self.config['DEFAULT']
 227         for attr in self._configured_attributes:
 228             key = self._configured_attribute_translations[attr]
 229             value = getattr(self, attr)
 230             if value is not None:
 231                 value = self._get_configured_option_value(
 232                     attribute=attr, value=value)
 233                 if (attr in self._non_default_configured_attributes or
 234                     value != default[key]):
 235                     data[key] = value
 236         self.config[self.section] = data
 237
 238     def load_from_config(self, config=None):
 239         "Restore configured attributes"
 240         if config is None:
 241             config = _config.CONFIG
 242         self.config = config
 243         if self.section in self.config:
 244             data = self.config[self.section]
 245         else:
 246             data = self.config['DEFAULT']
 247         keys = sorted(data.keys())
 248         expected = sorted(self._configured_attribute_translations.values())
 249         if keys != expected:
 250             for key in expected:
 251                 if (key not in keys and
 252                     key not in self._non_default_configured_attributes):
 253                     raise ValueError('missing key: {}'.format(key))
 254             for key in keys:
 255                 if key not in expected:
 256                     raise ValueError('extra key: {}'.format(key))
 257         data = dict(
 258             (self._configured_attribute_inverse_translations[k],
 259              self._get_configured_attribute_value(
 260                   attribute=self._configured_attribute_inverse_translations[k],
 261                   key=k, data=data))
 262             for k in data.keys())
 263         for attr in self._non_default_configured_attributes:
 264             if attr not in data:
 265                 data[attr] = None
 266         self.__dict__.update(data)
 267
 268     def _get_configured_option_value(self, attribute, value):
 269         if value is None:
 270             return ''
 271         elif attribute in self._list_attributes:
 272             return ', '.join(value)
 273         elif attribute in self._function_attributes:
 274             return _util.import_name(value)
 275         return str(value)
 276
 277     def _get_configured_attribute_value(self, attribute, key, data):
 278         if attribute in self._boolean_attributes:
 279             return data.getboolean(key)
 280         elif attribute in self._integer_attributes:
 281             return data.getint(key)
 282         elif attribute in self._list_attributes:
 283             return [x.strip() for x in data[key].split(',')]
 284         elif attribute in self._function_attributes:
 285             if data[key]:
 286                 return _util.import_function(data[key])
 287             return None
 288         return data[key]
 289
 290     def reset(self):
 291         """Reset dynamic data
 292         """
 293         self.etag = None
 294         self.modified = None
 295         self.seen = {}
 296
 297     def _set_name(self, name):
 298         if not self._name_regexp.match(name):
 299             raise _error.InvalidFeedName(name=name, feed=self)
 300         self.name = name
 301         self.section = 'feed.{}'.format(self.name)
 302
 303     def _fetch(self):
 304         """Fetch and parse a feed using feedparser.
 305
 306         >>> feed = Feed(
 307         ...    name='test-feed',
 308         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 309         >>> parsed = feed._fetch()
 310         >>> parsed.status
 311         200
 312         """
 313         _LOG.info('fetch {}'.format(self))
 314         if not self.url:
 315             raise _error.InvalidFeedConfig(setting='url', feed=self)
 316         if self.section in self.config:
 317             config = self.config[self.section]
 318         else:
 319             config = self.config['DEFAULT']
 320         proxy = config['proxy']
 321         timeout = config.getint('feed-timeout')
 322         kwargs = {}
 323         if proxy:
 324             kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
 325         f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
 326         return f(self.url, self.etag, modified=self.modified, **kwargs)
 327
 328     def _process(self, parsed):
 329         _LOG.info('process {}'.format(self))
 330         self._check_for_errors(parsed)
 331         for entry in reversed(parsed.entries):
 332             _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
 333             processed = self._process_entry(parsed=parsed, entry=entry)
 334             if processed:
 335                 guid,id_,sender,message = processed
 336                 if self.post_process:
 337                     message = self.post_process(
 338                         feed=self, parsed=parsed, entry=entry, guid=guid,
 339                         message=message)
 340                     if not message:
 341                         continue
 342                 yield (guid, id_, sender, message)
 343
 344     def _check_for_errors(self, parsed):
 345         warned = False
 346         status = getattr(parsed, 'status', 200)
 347         _LOG.debug('HTTP status {}'.format(status))
 348         if status == 301:
 349             _LOG.info('redirect {} from {} to {}'.format(
 350                     self.name, self.url, parsed['url']))
 351             self.url = parsed['url']
 352         elif status not in [200, 302, 304]:
 353             raise _error.HTTPError(status=status, feed=self)
 354
 355         http_headers = parsed.get('headers', {})
 356         if http_headers:
 357             _LOG.debug('HTTP headers: {}'.format(http_headers))
 358         if not http_headers:
 359             _LOG.warning('could not get HTTP headers: {}'.format(self))
 360             warned = True
 361         else:
 362             if 'html' in http_headers.get('content-type', 'rss'):
 363                 _LOG.warning('looks like HTML: {}'.format(self))
 364                 warned = True
 365             if http_headers.get('content-length', '1') == '0':
 366                 _LOG.warning('empty page: {}'.format(self))
 367                 warned = True
 368
 369         version = parsed.get('version', None)
 370         if version:
 371             _LOG.debug('feed version {}'.format(version))
 372         else:
 373             _LOG.warning('unrecognized version: {}'.format(self))
 374             warned = True
 375
 376         exc = parsed.get('bozo_exception', None)
 377         if isinstance(exc, _socket.timeout):
 378             _LOG.error('timed out: {}'.format(self))
 379             warned = True
 380         elif isinstance(exc, _SOCKET_ERRORS):
 381             reason = exc.args[1]
 382             _LOG.error('{}: {}'.format(exc, self))
 383             warned = True
 384         elif (hasattr(exc, 'reason') and
 385               isinstance(exc.reason, _urllib_error.URLError)):
 386             if isinstance(exc.reason, _SOCKET_ERRORS):
 387                 reason = exc.reason.args[1]
 388             else:
 389                 reason = exc.reason
 390             _LOG.error('{}: {}'.format(exc, self))
 391             warned = True
 392         elif isinstance(exc, _feedparser.zlib.error):
 393             _LOG.error('broken compression: {}'.format(self))
 394             warned = True
 395         elif isinstance(exc, (IOError, AttributeError)):
 396             _LOG.error('{}: {}'.format(exc, self))
 397             warned = True
 398         elif isinstance(exc, KeyboardInterrupt):
 399             raise exc
 400         elif isinstance(exc, _sax.SAXParseException):
 401             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
 402             warned = True
 403         elif parsed.bozo or exc:
 404             if exc is None:
 405                 exc = "can't process"
 406             _LOG.error('processing error: {}: {}'.format(exc, self))
 407             warned = True
 408
 409         if (not warned and
 410             status in [200, 302] and
 411             not parsed.entries and
 412             not version):
 413             raise _error.ProcessingError(parsed=parsed, feed=feed)
 414
 415     def _process_entry(self, parsed, entry):
 416         id_ = self._get_entry_id(entry)
 417         # If .trust_guid isn't set, we get back hashes of the content.
 418         # Instead of letting these run wild, we put them in context
 419         # by associating them with the actual ID (if it exists).
 420         guid = entry.get('id', id_)
 421         if isinstance(guid, dict):
 422             guid = guid.values()[0]
 423         if guid in self.seen:
 424             if self.seen[guid]['id'] == id_:
 425                 _LOG.debug('already seen {}'.format(id_))
 426                 return  # already seen
 427         sender = self._get_entry_email(parsed=parsed, entry=entry)
 428         subject = self._get_entry_title(entry)
 429         extra_headers = _collections.OrderedDict((
 430                 ('Date', self._get_entry_date(entry)),
 431                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
 432                 ('User-Agent', 'rss2email'),
 433                 ('X-RSS-Feed', self.url),
 434                 ('X-RSS-ID', id_),
 435                 ('X-RSS-URL', self._get_entry_link(entry)),
 436                 ('X-RSS-TAGS', self._get_entry_tags(entry)),
 437                 ))
 438         for k,v in extra_headers.items():  # remove empty tags, etc.
 439             if v is None:
 440                 extra_headers.pop(k)
 441         if self.bonus_header:
 442             for header in self.bonus_header.splitlines():
 443                 if ':' in header:
 444                     key,value = header.split(':', 1)
 445                     extra_headers[key.strip()] = value.strip()
 446                 else:
 447                     _LOG.warning(
 448                         'malformed bonus-header: {}'.format(
 449                             self.bonus_header))
 450
 451         content = self._get_entry_content(entry)
 452         content = self._process_entry_content(
 453             entry=entry, content=content, subject=subject)
 454         message = _email.get_message(
 455             sender=sender,
 456             recipient=self.to,
 457             subject=subject,
 458             body=content['value'],
 459             content_type=content['type'].split('/', 1)[1],
 460             extra_headers=extra_headers)
 461         return (guid, id_, sender, message)
 462
 463     def _get_entry_id(self, entry):
 464         """Get best ID from an entry."""
 465         if self.trust_guid:
 466             if getattr(entry, 'id', None):
 467                 # Newer versions of feedparser could return a dictionary
 468                 if isinstance(entry.id, dict):
 469                     return entry.id.values()[0]
 470                 return entry.id
 471         content = self._get_entry_content(entry)
 472         content_value = content['value'].strip()
 473         if content_value:
 474             return _hashlib.sha1(
 475                 content_value.encode('unicode-escape')).hexdigest()
 476         elif getattr(entry, 'link', None):
 477             return _hashlib.sha1(
 478                 entry.link.encode('unicode-escape')).hexdigest()
 479         elif getattr(entry, 'title', None):
 480             return _hashlib.sha1(
 481                 entry.title.encode('unicode-escape')).hexdigest()
 482
 483     def _get_entry_link(self, entry):
 484         return entry.get('link', None)
 485
 486     def _get_entry_title(self, entry):
 487         if hasattr(entry, 'title_detail') and entry.title_detail:
 488             title = entry.title_detail.value
 489             if 'html' in entry.title_detail.type:
 490                 title = _html2text.html2text(title)
 491         else:
 492             content = self._get_entry_content(entry)
 493             value = content['value']
 494             if content['type'] in ('text/html', 'application/xhtml+xml'):
 495                 value = _html2text.html2text(value)
 496             title = value[:70]
 497         title = title.replace('\n', ' ').strip()
 498         return title
 499
 500     def _get_entry_date(self, entry):
 501         datetime = _time.gmtime()
 502         if self.date_header:
 503             for datetype in self.date_header_order:
 504                 kind = datetype + '_parsed'
 505                 if entry.get(kind, None):
 506                     datetime = entry[kind]
 507                     break
 508         return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 509
 510     def _get_entry_name(self, parsed, entry):
 511         """Get the best name
 512
 513         >>> import feedparser
 514         >>> f = Feed(name='test-feed')
 515         >>> parsed = feedparser.parse(
 516         ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
 517         ...     '  <entry>\\n'
 518         ...     '    <author>\\n'
 519         ...     '      <name>Example author</name>\\n'
 520         ...     '      <email>me@example.com</email>\\n'
 521         ...     '      <url>http://example.com/</url>\\n'
 522         ...     '    </author>\\n'
 523         ...     '  </entry>\\n'
 524         ...     '</feed>\\n'
 525         ...     )
 526         >>> entry = parsed.entries[0]
 527         >>> f.friendly_name = False
 528         >>> f._get_entry_name(parsed, entry)
 529         ''
 530         >>> f.friendly_name = True
 531         >>> f._get_entry_name(parsed, entry)
 532         'Example author'
 533         """
 534         if not self.friendly_name:
 535             return ''
 536         parts = ['']
 537         feed = parsed.feed
 538         parts.append(feed.get('title', ''))
 539         for x in [entry, feed]:
 540             if 'name' in x.get('author_detail', []):
 541                 if x.author_detail.name:
 542                     if ''.join(parts):
 543                         parts.append(': ')
 544                     parts.append(x.author_detail.name)
 545                     break
 546         if not ''.join(parts) and self.use_publisher_email:
 547             if 'name' in feed.get('publisher_detail', []):
 548                 if ''.join(parts):
 549                     parts.append(': ')
 550                 parts.append(feed.publisher_detail.name)
 551         return _html2text.unescape(''.join(parts))
 552
 553     def _validate_email(self, email, default=None):
 554         """Do a basic quality check on email address
 555
 556         Return `default` if the address doesn't appear to be
 557         well-formed.  If `default` is `None`, return
 558         `self.from_email`.
 559
 560         >>> f = Feed(name='test-feed')
 561         >>> f._validate_email('valid@example.com', 'default@example.com')
 562         'valid@example.com'
 563         >>> f._validate_email('invalid@', 'default@example.com')
 564         'default@example.com'
 565         >>> f._validate_email('@invalid', 'default@example.com')
 566         'default@example.com'
 567         >>> f._validate_email('invalid', 'default@example.com')
 568         'default@example.com'
 569         """
 570         parts = email.split('@')
 571         if len(parts) != 2 or '' in parts:
 572             if default is None:
 573                 return self.from_email
 574             return default
 575         return email
 576
 577     def _get_entry_address(self, parsed, entry):
 578         """Get the best From email address ('<jdoe@a.com>')
 579
 580         If the best guess isn't well-formed (something@somthing.com),
 581         use `self.from_email` instead.
 582         """
 583         if self.force_from:
 584             return self.from_email
 585         feed = parsed.feed
 586         if 'email' in entry.get('author_detail', []):
 587             return self._validate_email(entry.author_detail.email)
 588         elif 'email' in feed.get('author_detail', []):
 589             return self._validate_email(feed.author_detail.email)
 590         if self.use_publisher_email:
 591             if 'email' in feed.get('publisher_detail', []):
 592                 return self._validate_email(feed.publisher_detail.email)
 593             if feed.get('errorreportsto', None):
 594                 return self._validate_email(feed.errorreportsto)
 595         _LOG.debug('no sender address found, fallback to default')
 596         return self.from_email
 597
 598     def _get_entry_email(self, parsed, entry):
 599         """Get the best From email address ('John <jdoe@a.com>')
 600         """
 601         name = self._get_entry_name(parsed=parsed, entry=entry)
 602         address = self._get_entry_address(parsed=parsed, entry=entry)
 603         return _formataddr((name, address))
 604
 605     def _get_entry_tags(self, entry):
 606         """Add post tags, if available
 607
 608         >>> f = Feed(name='test-feed')
 609         >>> f._get_entry_tags({
 610         ...         'tags': [{'term': 'tag1',
 611         ...                   'scheme': None,
 612         ...                   'label': None}]})
 613         'tag1'
 614         >>> f._get_entry_tags({
 615         ...         'tags': [{'term': 'tag1',
 616         ...                   'scheme': None,
 617         ...                   'label': None},
 618         ...                  {'term': 'tag2',
 619         ...                   'scheme': None,
 620         ...                   'label': None}]})
 621         'tag1,tag2'
 622
 623         Test some troublesome cases.  No tags:
 624
 625         >>> f._get_entry_tags({})
 626
 627         Empty tags:
 628
 629         >>> f._get_entry_tags({'tags': []})
 630
 631         Tags without a ``term`` entry:
 632
 633         >>> f._get_entry_tags({
 634         ...         'tags': [{'scheme': None,
 635         ...                   'label': None}]})
 636
 637         Tags with an empty term:
 638
 639         >>> f._get_entry_tags({
 640         ...         'tags': [{'term': '',
 641         ...                   'scheme': None,
 642         ...                   'label': None}]})
 643         """
 644         taglist = [tag['term'] for tag in entry.get('tags', [])
 645                    if tag.get('term', '')]
 646         if taglist:
 647             return ','.join(taglist)
 648
 649     def _get_entry_content(self, entry):
 650         """Select the best content from an entry.
 651
 652         Returns a feedparser content dict.
 653         """
 654         # How this works:
 655         #  * We have a bunch of potential contents.
 656         #  * We go thru looking for our first choice.
 657         #    (HTML or text, depending on self.html_mail)
 658         #  * If that doesn't work, we go thru looking for our second choice.
 659         #  * If that still doesn't work, we just take the first one.
 660         #
 661         # Possible future improvement:
 662         #  * Instead of just taking the first one
 663         #    pick the one in the "best" language.
 664         #  * HACK: hardcoded .html_mail, should take a tuple of media types
 665         contents = list(entry.get('content', []))
 666         if entry.get('summary_detail', None):
 667             contents.append(entry.summary_detail)
 668         if self.html_mail:
 669             types = ['text/html', 'text/plain']
 670         else:
 671             types = ['text/plain', 'text/html']
 672         for content_type in types:
 673             for content in contents:
 674                 if content['type'] == content_type:
 675                     return content
 676         if contents:
 677             return contents[0]
 678         return {'type': 'text/plain', 'value': ''}
 679
 680     def _process_entry_content(self, entry, content, subject):
 681         "Convert entry content to the requested format."
 682         link = self._get_entry_link(entry)
 683         if self.html_mail:
 684             lines = [
 685                 '<!DOCTYPE html>',
 686                 '<html>',
 687                 '  <head>',
 688                 ]
 689             if self.use_css and self.css:
 690                 lines.extend([
 691                         '    <style type="text/css">',
 692                         self.css,
 693                         '    </style>',
 694                         ])
 695             lines.extend([
 696                     '</head>',
 697                     '<body>',
 698                     '<div id="entry>',
 699                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
 700                         link, subject),
 701                     '<div id="body"><table><tr><td>',
 702                     ])
 703             if content['type'] in ('text/html', 'application/xhtml+xml'):
 704                 lines.append(content['value'].strip())
 705             else:
 706                 lines.append(_saxutils.escape(content['value'].strip()))
 707             lines.append('</td></tr></table></div>')
 708             lines.extend([
 709                     '<div class="footer">'
 710                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
 711                     ])
 712             for enclosure in getattr(entry, 'enclosures', []):
 713                 if getattr(enclosure, 'url', None):
 714                     lines.append(
 715                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 716                             enclosure.url))
 717                 if getattr(enclosure, 'src', None):
 718                     lines.append(
 719                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 720                             enclosure.src))
 721                     lines.append(
 722                         '<p><img src="{}" /></p>'.format(enclosure.src))
 723             for elink in getattr(entry, 'links', []):
 724                 if elink.get('rel', None) == 'via':
 725                     url = elink['href']
 726                     url = url.replace(
 727                         'http://www.google.com/reader/public/atom/',
 728                         'http://www.google.com/reader/view/')
 729                     title = url
 730                     if elink.get('title', None):
 731                         title = elink['title']
 732                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
 733                             url, title))
 734             lines.extend([
 735                     '</div>',  # /footer
 736                     '</div>',  # /entry
 737                     '</body>',
 738                     '</html>',
 739                     ''])
 740             content['type'] = 'text/html'
 741             content['value'] = '\n'.join(lines)
 742             return content
 743         else:  # not self.html_mail
 744             if content['type'] in ('text/html', 'application/xhtml+xml'):
 745                 lines = [_html2text.html2text(content['value'])]
 746             else:
 747                 lines = [content['value']]
 748             lines.append('')
 749             lines.append('URL: {}'.format(link))
 750             for enclosure in getattr(entry, 'enclosures', []):
 751                 if getattr(enclosure, 'url', None):
 752                     lines.append('Enclosure: {}'.format(enclosure.url))
 753                 if getattr(enclosure, 'src', None):
 754                     lines.append('Enclosure: {}'.format(enclosure.src))
 755             for elink in getattr(entry, 'links', []):
 756                 if elink.get('rel', None) == 'via':
 757                     url = elink['href']
 758                     url = url.replace(
 759                         'http://www.google.com/reader/public/atom/',
 760                         'http://www.google.com/reader/view/')
 761                     title = url
 762                     if elink.get('title', None):
 763                         title = elink['title']
 764                     lines.append('Via: {} {}'.format(title, url))
 765             content['type'] = 'text/plain'
 766             content['value'] = '\n'.join(lines)
 767             return content
 768
 769     def _send(self, sender, message):
 770         _LOG.info('send message for {}'.format(self))
 771         section = self.section
 772         if section not in self.config:
 773             section = 'DEFAULT'
 774         _email.send(sender=sender, recipient=self.to, message=message,
 775                     config=self.config, section=section)
 776
 777     def run(self, send=True):
 778         """Fetch and process the feed, mailing entry emails.
 779
 780         >>> feed = Feed(
 781         ...    name='test-feed',
 782         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 783         >>> def send(sender, message):
 784         ...    print('send from {}:'.format(sender))
 785         ...    print(message.as_string())
 786         >>> feed._send = send
 787         >>> feed.to = 'jdoe@dummy.invalid'
 788         >>> #parsed = feed.run()  # enable for debugging
 789         """
 790         if not self.to:
 791             raise _error.NoToEmailAddress(feed=self)
 792         parsed = self._fetch()
 793         for (guid, id_, sender, message) in self._process(parsed):
 794             _LOG.debug('new message: {}'.format(message['Subject']))
 795             if send:
 796                 self._send(sender=sender, message=message)
 797             if guid not in self.seen:
 798                 self.seen[guid] = {}
 799             self.seen[guid]['id'] = id_
 800         self.etag = parsed.get('etag', None)
 801         self.modified = parsed.get('modified', None)