1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.utils import formataddr as _formataddr
32 import hashlib as _hashlib
33 import html.parser as _html_parser
35 import socket as _socket
37 import urllib.error as _urllib_error
38 import urllib.request as _urllib_request
40 import xml.sax as _sax
41 import xml.sax.saxutils as _saxutils
43 import feedparser as _feedparser
44 import html2text as _html2text
47 from . import __version__
48 from . import LOG as _LOG
49 from . import config as _config
50 from . import email as _email
51 from . import error as _error
52 from . import util as _util
# Identify ourselves (name, version, project URL) in HTTP requests that
# feedparser makes on our behalf.
_feedparser.USER_AGENT = 'rss2email/{version} +{url}'.format(
    version=__version__, url=__url__)

# Install a default global opener for urllib-based fetches.
_urllib_request.install_opener(_urllib_request.build_opener())
58 for e in ['error', 'herror', 'gaierror']:
59 if hasattr(_socket, e):
60 _SOCKET_ERRORS.append(getattr(_socket, e))
61 del e # cleanup namespace
62 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
66 """Utility class for feed manipulation and storage.
70 >>> from .config import CONFIG
73 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
75 test-feed (http://example.com/feed.atom -> a@b.com)
79 'user@rss2email.invalid'
81 >>> feed.from_email = 'a@b.com'
82 >>> feed.save_to_config()
83 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
85 from = user@rss2email.invalid
90 url = http://example.com/feed.atom
95 >>> feed.etag = 'dummy etag'
96 >>> string = pickle.dumps(feed)
97 >>> feed = pickle.loads(string)
98 >>> feed.load_from_config(config=CONFIG)
102 'http://example.com/feed.atom'
104 Names can only contain ASCII letters, digits, and '._-'. Here the
105 invalid space causes an exception:
107 >>> Feed(name='invalid name')
108 Traceback (most recent call last):
110 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
112 You must define a URL:
114 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
115 Traceback (most recent call last):
117 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
122 >>> CONFIG['DEFAULT']['to'] = ''
123 >>> test_section = CONFIG.pop('feed.test-feed')
125 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
127 # saved/loaded from feed.dat using __getstate__/__setstate__.
128 _dynamic_attributes = [
135 ## saved/loaded from ConfigParser instance
136 # attributes that aren't in DEFAULT
137 _non_default_configured_attributes = [
140 # attributes that are in DEFAULT
141 _default_configured_attributes = [
142 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
143 _default_configured_attributes[
144 _default_configured_attributes.index('from')
145 ] = 'from_email' # `from` is a Python keyword
146 # all attributes that are saved/loaded from .config
147 _configured_attributes = (
148 _non_default_configured_attributes + _default_configured_attributes)
149 # attribute name -> .config option
150 _configured_attribute_translations = dict(
151 (attr,attr) for attr in _non_default_configured_attributes)
152 _configured_attribute_translations.update(dict(
153 zip(_default_configured_attributes,
154 _config.CONFIG['DEFAULT'].keys())))
155 # .config option -> attribute name
156 _configured_attribute_inverse_translations = dict(
157 (v,k) for k,v in _configured_attribute_translations.items())
159 # hints for value conversion
160 _boolean_attributes = [
162 'use_publisher_email',
170 'links_after_each_paragraph',
175 _integer_attributes = [
    def __init__(self, name=None, url=None, to=None, config=None):
        """Create a feed: validate the name, seed dynamic state, and
        load configured settings from ``config``.

        NOTE(review): this view appears to omit some lines -- ``url``
        and ``to`` are accepted but their storage is not visible here;
        confirm against the full file.
        """
        self._set_name(name=name)
        # Seed the dynamic attributes through the shared __setstate__
        # path, reading whatever values getattr currently resolves
        # (presumably defaults set elsewhere -- confirm).
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)
198 return '{} ({} -> {})'.format(self.name, self.url, self.to)
201 return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # NOTE(review): a line appears to be missing from this view here
        # (likely ``return dict(``) -- the generator expression below is
        # otherwise orphaned; confirm against the full file.
        (key,getattr(self,key)) for key in self._dynamic_attributes)

    get_state = __getstate__ # make it publicly accessible
210 def __setstate__(self, state):
211 "Restore dynamic attributes"
212 keys = sorted(state.keys())
213 if keys != sorted(self._dynamic_attributes):
214 raise ValueError(state)
215 self._set_name(name=state['name'])
216 self.__dict__.update(state)
218 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            # Translate the attribute name to its .config option name
            # (e.g. from_email -> from).
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                # Serialize to a config-file string representation.
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
                # Only store options that differ from DEFAULT, or that
                # have no DEFAULT entry at all.
                if (attr in self._non_default_configured_attributes or
                    value != default[key]):
                # NOTE(review): the body of this ``if`` (likely
                # ``data[key] = value``) is not visible in this view --
                # confirm against the full file.
        self.config[self.section] = data
235 def load_from_config(self, config=None):
236 "Restore configured attributes"
238 config = _config.CONFIG
240 if self.section in self.config:
241 data = self.config[self.section]
243 data = self.config['DEFAULT']
244 keys = sorted(data.keys())
245 expected = sorted(self._configured_attribute_translations.values())
248 if (key not in keys and
249 key not in self._non_default_configured_attributes):
250 raise _error.InvalidFeedConfig(
251 setting=key, feed=self,
252 message='missing configuration key: {}'.format(key))
254 if key not in expected:
255 raise _error.InvalidFeedConfig(
256 setting=key, feed=self,
257 message='extra configuration key: {}'.format(key))
259 (self._configured_attribute_inverse_translations[k],
260 self._get_configured_attribute_value(
261 attribute=self._configured_attribute_inverse_translations[k],
263 for k in data.keys())
264 for attr in self._non_default_configured_attributes:
267 self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        # Serialize ``value`` for ConfigParser storage; list-valued
        # attributes are flattened to comma-separated strings.
        if value and attribute in self._list_attributes:
            return ', '.join(value)
        # NOTE(review): the fallthrough return for non-list values is
        # not visible in this view (likely ``return str(value)``) --
        # confirm against the full file.
    def _get_configured_attribute_value(self, attribute, key, data):
        """Deserialize config option ``key`` from ``data`` into the
        native type hinted for ``attribute`` (bool/int/list).
        """
        if attribute in self._boolean_attributes:
            return data.getboolean(key)
        elif attribute in self._integer_attributes:
            return data.getint(key)
        elif attribute in self._list_attributes:
            # Comma-separated string -> list of stripped items.
            return [x.strip() for x in data[key].split(',')]
        # NOTE(review): the plain-string fallthrough (likely
        # ``return data[key]``) is not visible in this view -- confirm.
284 """Reset dynamic data
    def _set_name(self, name):
        # Feed names become config section names, so restrict them to a
        # safe ASCII charset (letters, digits, '._-') per _name_regexp.
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        # NOTE(review): the ``self.name = name`` assignment appears to
        # be missing from this view -- ``self.name`` is read below;
        # confirm against the full file.
        self.section = 'feed.{}'.format(self.name)
297 """Fetch and parse a feed using feedparser.
300 ... name='test-feed',
301 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
302 >>> parsed = feed._fetch()
306 _LOG.info('fetch {}'.format(self))
308 raise _error.InvalidFeedConfig(setting='url', feed=self)
309 if self.section in self.config:
310 config = self.config[self.section]
312 config = self.config['DEFAULT']
313 proxy = config['proxy']
314 timeout = config.getint('feed-timeout')
317 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
318 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
319 return f(self.url, self.etag, modified=self.modified, **kwargs)
321 def _process(self, parsed):
322 _LOG.info('process {}'.format(self))
323 self._check_for_errors(parsed)
324 for entry in reversed(parsed.entries):
325 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
326 processed = self._process_entry(parsed=parsed, entry=entry)
330 def _check_for_errors(self, parsed):
332 status = getattr(parsed, 'status', 200)
333 _LOG.debug('HTTP status {}'.format(status))
335 _LOG.info('redirect {} from {} to {}'.format(
336 self.name, self.url, parsed['url']))
337 self.url = parsed['url']
338 elif status not in [200, 302, 304]:
339 raise _error.HTTPError(status=status, feed=self)
341 http_headers = parsed.get('headers', {})
343 _LOG.debug('HTTP headers: {}'.format(http_headers))
345 _LOG.warning('could not get HTTP headers: {}'.format(self))
348 if 'html' in http_headers.get('content-type', 'rss'):
349 _LOG.warning('looks like HTML: {}'.format(self))
351 if http_headers.get('content-length', '1') == '0':
352 _LOG.warning('empty page: {}'.format(self))
355 version = parsed.get('version', None)
357 _LOG.debug('feed version {}'.format(version))
359 _LOG.warning('unrecognized version: {}'.format(self))
362 exc = parsed.get('bozo_exception', None)
363 if isinstance(exc, _socket.timeout):
364 _LOG.error('timed out: {}'.format(self))
366 elif isinstance(exc, OSError):
367 _LOG.error('{}: {}'.format(exc, self))
369 elif isinstance(exc, _SOCKET_ERRORS):
370 _LOG.error('{}: {}'.format(exc, self))
372 elif isinstance(exc, _feedparser.zlib.error):
373 _LOG.error('broken compression: {}'.format(self))
375 elif isinstance(exc, (IOError, AttributeError)):
376 _LOG.error('{}: {}'.format(exc, self))
378 elif isinstance(exc, KeyboardInterrupt):
380 elif isinstance(exc, _sax.SAXParseException):
381 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
383 elif parsed.bozo or exc:
385 exc = "can't process"
386 _LOG.error('processing error: {}: {}'.format(exc, self))
390 status in [200, 302] and
391 not parsed.entries and
393 raise _error.ProcessingError(parsed=parsed, feed=feed)
395 def _process_entry(self, parsed, entry):
396 id_ = self._get_entry_id(entry)
397 # If .trust_guid isn't set, we get back hashes of the content.
398 # Instead of letting these run wild, we put them in context
399 # by associating them with the actual ID (if it exists).
400 guid = entry.get('id', id_)
401 if isinstance(guid, dict):
402 guid = guid.values()[0]
403 if guid in self.seen:
404 if self.seen[guid]['id'] == id_:
405 _LOG.debug('already seen {}'.format(id_))
406 return # already seen
407 sender = self._get_entry_email(parsed=parsed, entry=entry)
408 subject = self._get_entry_title(entry)
409 extra_headers = _collections.OrderedDict((
410 ('Date', self._get_entry_date(entry)),
411 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
412 ('User-Agent', 'rss2email'),
413 ('X-RSS-Feed', self.url),
415 ('X-RSS-URL', self._get_entry_link(entry)),
416 ('X-RSS-TAGS', self._get_entry_tags(entry)),
418 for k,v in extra_headers.items(): # remove empty tags, etc.
421 if self.bonus_header:
422 for header in self.bonus_header.splitlines():
424 key,value = header.split(':', 1)
425 extra_headers[key.strip()] = value.strip()
428 'malformed bonus-header: {}'.format(
431 content = self._get_entry_content(entry)
433 content = self._process_entry_content(
434 entry=entry, content=content, subject=subject)
435 except _error.ProcessingError as e:
438 message = _email.get_message(
442 body=content['value'],
443 content_type=content['type'].split('/', 1)[1],
444 extra_headers=extra_headers,
446 section=self.section)
447 return (guid, id_, sender, message)
    def _get_entry_id(self, entry):
        """Get best ID from an entry."""
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # NOTE(review): in Python 3, dict.values() returns a
                # view that does not support indexing -- this line would
                # raise TypeError; should probably be
                # ``list(entry.id.values())[0]``.
                return entry.id.values()[0]
        # No usable id: fall back to hashing content/link/title.
        content = self._get_entry_content(entry)
        content_value = content['value'].strip()
        # NOTE(review): a guard line (likely ``if content_value:``)
        # appears to be missing here -- the ``elif`` chain below is
        # otherwise orphaned; confirm against the full file.
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'link', None):
            return _hashlib.sha1(
                entry.link.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'title', None):
            return _hashlib.sha1(
                entry.title.encode('unicode-escape')).hexdigest()
469 def _get_entry_link(self, entry):
470 return entry.get('link', None)
472 def _get_entry_title(self, entry):
473 if hasattr(entry, 'title_detail') and entry.title_detail:
474 title = entry.title_detail.value
475 if 'html' in entry.title_detail.type:
476 title = _html2text.html2text(title)
478 content = self._get_entry_content(entry)
479 value = content['value']
480 if content['type'] in ('text/html', 'application/xhtml+xml'):
481 value = _html2text.html2text(value)
483 title = title.replace('\n', ' ').strip()
486 def _get_entry_date(self, entry):
487 datetime = _time.gmtime()
489 for datetype in self.date_header_order:
490 kind = datetype + '_parsed'
491 if entry.get(kind, None):
492 datetime = entry[kind]
494 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
496 def _get_entry_name(self, parsed, entry):
499 >>> import feedparser
500 >>> f = Feed(name='test-feed')
501 >>> parsed = feedparser.parse(
502 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
505 ... ' <name>Example author</name>\\n'
506 ... ' <email>me@example.com</email>\\n'
507 ... ' <url>http://example.com/</url>\\n'
512 >>> entry = parsed.entries[0]
513 >>> f.friendly_name = False
514 >>> f._get_entry_name(parsed, entry)
516 >>> f.friendly_name = True
517 >>> f._get_entry_name(parsed, entry)
520 if not self.friendly_name:
524 parts.append(feed.get('title', ''))
525 for x in [entry, feed]:
526 if 'name' in x.get('author_detail', []):
527 if x.author_detail.name:
530 parts.append(x.author_detail.name)
532 if not ''.join(parts) and self.use_publisher_email:
533 if 'name' in feed.get('publisher_detail', []):
536 parts.append(feed.publisher_detail.name)
537 return _html2text.unescape(''.join(parts))
539 def _validate_email(self, email, default=None):
540 """Do a basic quality check on email address
542 Return `default` if the address doesn't appear to be
543 well-formed. If `default` is `None`, return
546 >>> f = Feed(name='test-feed')
547 >>> f._validate_email('valid@example.com', 'default@example.com')
549 >>> f._validate_email('invalid@', 'default@example.com')
550 'default@example.com'
551 >>> f._validate_email('@invalid', 'default@example.com')
552 'default@example.com'
553 >>> f._validate_email('invalid', 'default@example.com')
554 'default@example.com'
556 parts = email.split('@')
557 if len(parts) != 2 or '' in parts:
559 return self.from_email
563 def _get_entry_address(self, parsed, entry):
564 """Get the best From email address ('<jdoe@a.com>')
566 If the best guess isn't well-formed (something@somthing.com),
567 use `self.from_email` instead.
570 return self.from_email
572 if 'email' in entry.get('author_detail', []):
573 return self._validate_email(entry.author_detail.email)
574 elif 'email' in feed.get('author_detail', []):
575 return self._validate_email(feed.author_detail.email)
576 if self.use_publisher_email:
577 if 'email' in feed.get('publisher_detail', []):
578 return self._validate_email(feed.publisher_detail.email)
579 if feed.get('errorreportsto', None):
580 return self._validate_email(feed.errorreportsto)
581 _LOG.debug('no sender address found, fallback to default')
582 return self.from_email
    def _get_entry_email(self, parsed, entry):
        """Get the best From email address ('John <jdoe@a.com>')

        Combines the friendly name with the validated address using
        ``email.utils.formataddr``.
        """
        name = self._get_entry_name(parsed=parsed, entry=entry)
        address = self._get_entry_address(parsed=parsed, entry=entry)
        return _formataddr((name, address))
591 def _get_entry_tags(self, entry):
592 """Add post tags, if available
594 >>> f = Feed(name='test-feed')
595 >>> f._get_entry_tags({
596 ... 'tags': [{'term': 'tag1',
598 ... 'label': None}]})
600 >>> f._get_entry_tags({
601 ... 'tags': [{'term': 'tag1',
606 ... 'label': None}]})
609 Test some troublesome cases. No tags:
611 >>> f._get_entry_tags({})
615 >>> f._get_entry_tags({'tags': []})
617 Tags without a ``term`` entry:
619 >>> f._get_entry_tags({
620 ... 'tags': [{'scheme': None,
621 ... 'label': None}]})
623 Tags with an empty term:
625 >>> f._get_entry_tags({
626 ... 'tags': [{'term': '',
628 ... 'label': None}]})
630 taglist = [tag['term'] for tag in entry.get('tags', [])
631 if tag.get('term', '')]
633 return ','.join(taglist)
635 def _get_entry_content(self, entry):
636 """Select the best content from an entry.
638 Returns a feedparser content dict.
641 # * We have a bunch of potential contents.
642 # * We go thru looking for our first choice.
643 # (HTML or text, depending on self.html_mail)
644 # * If that doesn't work, we go thru looking for our second choice.
645 # * If that still doesn't work, we just take the first one.
647 # Possible future improvement:
648 # * Instead of just taking the first one
649 # pick the one in the "best" language.
650 # * HACK: hardcoded .html_mail, should take a tuple of media types
651 contents = list(entry.get('content', []))
652 if entry.get('summary_detail', None):
653 contents.append(entry.summary_detail)
655 types = ['text/html', 'text/plain']
657 types = ['text/plain', 'text/html']
658 for content_type in types:
659 for content in contents:
660 if content['type'] == content_type:
664 return {'type': 'text/plain', 'value': ''}
666 def _process_entry_content(self, entry, content, subject):
667 "Convert entry content to the requested format."
668 link = self._get_entry_link(entry)
675 if self.use_css and self.css:
677 ' <style type="text/css">',
685 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
689 if content['type'] in ('text/html', 'application/xhtml+xml'):
690 lines.append(content['value'].strip())
692 lines.append(_saxutils.escape(content['value'].strip()))
693 lines.append('</div>')
695 '<div class="footer">'
696 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
698 for enclosure in getattr(entry, 'enclosures', []):
699 if getattr(enclosure, 'url', None):
701 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
703 if getattr(enclosure, 'src', None):
705 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
708 '<p><img src="{}" /></p>'.format(enclosure.src))
709 for elink in getattr(entry, 'links', []):
710 if elink.get('rel', None) == 'via':
712 title = elink.get('title', url)
713 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
721 content['type'] = 'text/html'
722 content['value'] = '\n'.join(lines)
724 else: # not self.html_mail
725 if content['type'] in ('text/html', 'application/xhtml+xml'):
727 lines = [_html2text.html2text(content['value'])]
728 except _html_parser.HTMLParseError as e:
729 raise _error.ProcessingError(parsed=None, feed=self)
731 lines = [content['value']]
733 lines.append('URL: {}'.format(link))
734 for enclosure in getattr(entry, 'enclosures', []):
735 if getattr(enclosure, 'url', None):
736 lines.append('Enclosure: {}'.format(enclosure.url))
737 if getattr(enclosure, 'src', None):
738 lines.append('Enclosure: {}'.format(enclosure.src))
739 for elink in getattr(entry, 'links', []):
740 if elink.get('rel', None) == 'via':
742 title = elink.get('title', url)
743 lines.append('Via: {} {}'.format(title, url))
744 content['type'] = 'text/plain'
745 content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Send ``message`` from ``sender`` to ``self.to`` via the
        rss2email email helper, using this feed's config section.
        """
        _LOG.info('send message for {}'.format(self))
        section = self.section
        # NOTE(review): the fallback body of this ``if`` (likely
        # ``section = 'DEFAULT'``) is not visible in this view --
        # confirm against the full file.
        if section not in self.config:
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
756 def run(self, send=True):
757 """Fetch and process the feed, mailing entry emails.
760 ... name='test-feed',
761 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
762 >>> def send(sender, message):
763 ... print('send from {}:'.format(sender))
764 ... print(message.as_string())
765 >>> feed._send = send
766 >>> feed.to = 'jdoe@dummy.invalid'
767 >>> #parsed = feed.run() # enable for debugging
770 raise _error.NoToEmailAddress(feed=self)
771 parsed = self._fetch()
772 for (guid, id_, sender, message) in self._process(parsed):
773 _LOG.debug('new message: {}'.format(message['Subject']))
775 self._send(sender=sender, message=message)
776 if guid not in self.seen:
778 self.seen[guid]['id'] = id_
779 self.etag = parsed.get('etag', None)
780 self.modified = parsed.get('modified', None)