1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
import collections as _collections
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
# Identify ourselves to remote servers and install the default urllib
# opener (feedparser fetches through urllib under the hood).
_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket exception classes available on this platform so
# network failures can be caught with a single `except _SOCKET_ERRORS:`.
# The accumulator must be initialized before the loop appends to it.
_SOCKET_ERRORS = []
for e in ['error', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
64 """Utility class for feed manipulation and storage.
68 >>> from .config import CONFIG
71 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
73 test-feed (http://example.com/feed.atom -> a@b.com)
77 'user@rss2email.invalid'
79 >>> feed.from_email = 'a@b.com'
80 >>> feed.save_to_config()
81 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
83 from = user@rss2email.invalid
88 url = http://example.com/feed.atom
93 >>> feed.etag = 'dummy etag'
94 >>> string = pickle.dumps(feed)
95 >>> feed = pickle.loads(string)
96 >>> feed.load_from_config(config=CONFIG)
100 'http://example.com/feed.atom'
102 Names can only contain ASCII letters, digits, and '._-'. Here the
103 invalid space causes an exception:
105 >>> Feed(name='invalid name')
106 Traceback (most recent call last):
108 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
110 You must define a URL:
112 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
113 Traceback (most recent call last):
115 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
120 >>> CONFIG['DEFAULT']['to'] = ''
121 >>> test_section = CONFIG.pop('feed.test-feed')
    # Valid feed names: ASCII letters, digits, and '._-' only; enforced
    # by _set_name() below.
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')

    # Data that's saved/loaded from feed.dat using __getstate__/__setstate__.
    # NOTE(review): the member list is elided in this view of the file —
    # confirm against the full source.
    _dynamic_attributes = [

    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    # NOTE(review): members elided in this view — confirm against the full source.
    _non_default_configured_attributes = [

    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email'  # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())

    # hints for value conversion (used by the _get_configured_*_value
    # helpers below); NOTE(review): list members partially elided.
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',

    _integer_attributes = [
183 def __init__(self, name=None, url=None, to=None, config=None):
184 self._set_name(name=name)
186 self.__setstate__(dict(
187 (attr, getattr(self, attr))
188 for attr in self._dynamic_attributes))
189 self.load_from_config(config=config)
196 return '{} ({} -> {})'.format(self.name, self.url, self.to)
199 return '<Feed {}>'.format(str(self))
201 def __getstate__(self):
202 "Save dyamic attributes"
204 (key,getattr(self,key)) for key in self._dynamic_attributes)
206 def __setstate__(self, state):
207 "Restore dynamic attributes"
208 keys = sorted(state.keys())
209 if keys != sorted(self._dynamic_attributes):
210 raise ValueError(state)
211 self._set_name(name=state['name'])
212 self.__dict__.update(state)
214 def save_to_config(self):
215 "Save configured attributes"
216 data = _collections.OrderedDict()
217 default = self.config['DEFAULT']
218 for attr in self._configured_attributes:
219 key = self._configured_attribute_translations[attr]
220 value = getattr(self, attr)
221 if value is not None:
222 value = self._get_configured_option_value(
223 attribute=attr, value=value)
224 if (attr in self._non_default_configured_attributes or
225 value != default[key]):
227 self.config[self.section] = data
229 def load_from_config(self, config=None):
230 "Restore configured attributes"
232 config = _config.CONFIG
234 if self.section in self.config:
235 data = self.config[self.section]
237 data = self.config['DEFAULT']
238 keys = sorted(data.keys())
239 expected = sorted(self._configured_attribute_translations.values())
242 if (key not in keys and
243 key not in self._non_default_configured_attributes):
244 raise ValueError('missing key: {}'.format(key))
246 if key not in expected:
247 raise ValueError('extra key: {}'.format(key))
249 (self._configured_attribute_inverse_translations[k],
250 self._get_configured_attribute_value(
251 attribute=self._configured_attribute_inverse_translations[k],
253 for k in data.keys())
254 for attr in self._non_default_configured_attributes:
257 self.__dict__.update(data)
259 def _get_configured_option_value(self, attribute, value):
260 if value and attribute in self._list_attributes:
261 return ', '.join(value)
264 def _get_configured_attribute_value(self, attribute, key, data):
265 if attribute in self._boolean_attributes:
266 return data.getboolean(key)
267 elif attribute in self._integer_attributes:
268 return data.getint(key)
269 elif attribute in self._list_attributes:
270 return [x.strip() for x in data[key].split(',')]
274 """Reset dynamic data
280 def _set_name(self, name):
281 if not self._name_regexp.match(name):
282 raise _error.InvalidFeedName(name=name, feed=self)
284 self.section = 'feed.{}'.format(self.name)
287 """Fetch and parse a feed using feedparser.
290 ... name='test-feed',
291 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
292 >>> parsed = feed._fetch()
296 _LOG.info('fetch {}'.format(self))
298 raise _error.InvalidFeedConfig(setting='url', feed=self)
299 if self.section in self.config:
300 config = self.config[self.section]
302 config = self.config['DEFAULT']
303 proxy = config['proxy']
304 timeout = config.getint('feed-timeout')
307 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
308 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
309 return f(self.url, self.etag, modified=self.modified, **kwargs)
311 def _process(self, parsed):
312 _LOG.info('process {}'.format(self))
313 self._check_for_errors(parsed)
314 for entry in reversed(parsed.entries):
315 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
316 processed = self._process_entry(parsed=parsed, entry=entry)
320 def _check_for_errors(self, parsed):
322 status = getattr(parsed, 'status', 200)
323 _LOG.debug('HTTP status {}'.format(status))
325 _LOG.info('redirect {} from {} to {}'.format(
326 self.name, self.url, parsed['url']))
327 self.url = parsed['url']
328 elif status not in [200, 302, 304]:
329 raise _error.HTTPError(status=status, feed=self)
331 http_headers = parsed.get('headers', {})
333 _LOG.debug('HTTP headers: {}'.format(http_headers))
335 _LOG.warning('could not get HTTP headers: {}'.format(self))
338 if 'html' in http_headers.get('content-type', 'rss'):
339 _LOG.warning('looks like HTML: {}'.format(self))
341 if http_headers.get('content-length', '1') == '0':
342 _LOG.warning('empty page: {}'.format(self))
345 version = parsed.get('version', None)
347 _LOG.debug('feed version {}'.format(version))
349 _LOG.warning('unrecognized version: {}'.format(self))
352 exc = parsed.get('bozo_exception', None)
353 if isinstance(exc, _socket.timeout):
354 _LOG.error('timed out: {}'.format(self))
356 elif isinstance(exc, _SOCKET_ERRORS):
358 _LOG.error('{}: {}'.format(exc, self))
360 elif (hasattr(exc, 'reason') and
361 isinstance(exc.reason, _urllib_error.URLError)):
362 if isinstance(exc.reason, _SOCKET_ERRORS):
363 reason = exc.reason.args[1]
366 _LOG.error('{}: {}'.format(exc, self))
368 elif isinstance(exc, _feedparser.zlib.error):
369 _LOG.error('broken compression: {}'.format(self))
371 elif isinstance(exc, (IOError, AttributeError)):
372 _LOG.error('{}: {}'.format(exc, self))
374 elif isinstance(exc, KeyboardInterrupt):
376 elif isinstance(exc, _sax.SAXParseException):
377 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
379 elif parsed.bozo or exc:
381 exc = "can't process"
382 _LOG.error('processing error: {}: {}'.format(exc, self))
386 status in [200, 302] and
387 not parsed.entries and
389 raise _error.ProcessingError(parsed=parsed, feed=feed)
391 def _process_entry(self, parsed, entry):
392 id_ = self._get_entry_id(entry)
393 # If .trust_guid isn't set, we get back hashes of the content.
394 # Instead of letting these run wild, we put them in context
395 # by associating them with the actual ID (if it exists).
396 guid = entry.get('id', id_)
397 if isinstance(guid, dict):
398 guid = guid.values()[0]
399 if guid in self.seen:
400 if self.seen[guid] == id_:
401 _LOG.debug('already seen {}'.format(id_))
402 return # already seen
403 sender = self._get_entry_email(parsed=parsed, entry=entry)
404 link = entry.get('link', None)
405 subject = self._get_entry_title(entry)
406 extra_headers = _collections.OrderedDict((
407 ('Date', self._get_entry_date(entry)),
408 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
409 ('User-Agent', 'rss2email'),
410 ('X-RSS-Feed', self.url),
413 ('X-RSS-TAGS', self._get_entry_tags(entry)),
415 for k,v in extra_headers.items(): # remove empty tags, etc.
418 if self.bonus_header:
419 for header in self.bonus_header.splitlines():
421 key,value = header.split(':', 1)
422 extra_headers[key.strip()] = value.strip()
425 'malformed bonus-header: {}'.format(
428 content = self._get_entry_content(entry)
429 content = self._process_entry_content(
430 entry=entry, content=content, link=link, subject=subject)
431 message = _email.get_message(
435 body=content['value'],
436 content_type=content['type'].split('/', 1)[1],
437 extra_headers=extra_headers)
438 return (guid, id_, sender, message)
440 def _get_entry_id(self, entry):
441 """Get best ID from an entry."""
443 if getattr(entry, 'id', None):
444 # Newer versions of feedparser could return a dictionary
445 if isinstance(entry.id, dict):
446 return entry.id.values()[0]
448 content = self._get_entry_content(entry)
449 content_value = content['value'].strip()
451 return _hashlib.sha1(
452 content_value.encode('unicode-escape')).hexdigest()
453 elif getattr(entry, 'link', None):
454 return _hashlib.sha1(
455 entry.link.encode('unicode-escape')).hexdigest()
456 elif getattr(entry, 'title', None):
457 return _hashlib.sha1(
458 entry.title.encode('unicode-escape')).hexdigest()
460 def _get_entry_title(self, entry):
461 if hasattr(entry, 'title_detail') and entry.title_detail:
462 title = entry.title_detail.value
463 if 'html' in entry.title_detail.type:
464 title = _html2text.html2text(title)
466 content = self._get_entry_content(entry)
467 value = content['value']
468 if content['type'] in ('text/html', 'application/xhtml+xml'):
469 value = _html2text.html2text(value)
471 title = title.replace('\n', ' ').strip()
474 def _get_entry_date(self, entry):
475 datetime = _time.gmtime()
477 for datetype in self.date_header_order:
478 kind = datetype + '_parsed'
479 if entry.get(kind, None):
480 datetime = entry[kind]
482 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
484 def _get_entry_name(self, parsed, entry):
487 >>> import feedparser
488 >>> f = Feed(name='test-feed')
489 >>> parsed = feedparser.parse(
490 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
493 ... ' <name>Example author</name>\\n'
494 ... ' <email>me@example.com</email>\\n'
495 ... ' <url>http://example.com/</url>\\n'
500 >>> entry = parsed.entries[0]
501 >>> f.friendly_name = False
502 >>> f._get_entry_name(parsed, entry)
504 >>> f.friendly_name = True
505 >>> f._get_entry_name(parsed, entry)
508 if not self.friendly_name:
512 parts.append(feed.get('title', ''))
513 for x in [entry, feed]:
514 if 'name' in x.get('author_detail', []):
515 if x.author_detail.name:
518 parts.append(x.author_detail.name)
520 if not ''.join(parts) and self.use_publisher_email:
521 if 'name' in feed.get('publisher_detail', []):
524 parts.append(feed.publisher_detail.name)
525 return _html2text.unescape(''.join(parts))
527 def _validate_email(self, email, default=None):
528 """Do a basic quality check on email address
530 Return `default` if the address doesn't appear to be
531 well-formed. If `default` is `None`, return
534 >>> f = Feed(name='test-feed')
535 >>> f._validate_email('valid@example.com', 'default@example.com')
537 >>> f._validate_email('invalid@', 'default@example.com')
538 'default@example.com'
539 >>> f._validate_email('@invalid', 'default@example.com')
540 'default@example.com'
541 >>> f._validate_email('invalid', 'default@example.com')
542 'default@example.com'
544 parts = email.split('@')
545 if len(parts) != 2 or '' in parts:
547 return self.from_email
551 def _get_entry_address(self, parsed, entry):
552 """Get the best From email address ('<jdoe@a.com>')
554 If the best guess isn't well-formed (something@somthing.com),
555 use `self.from_email` instead.
558 return self.from_email
560 if 'email' in entry.get('author_detail', []):
561 return self._validate_email(entry.author_detail.email)
562 elif 'email' in feed.get('author_detail', []):
563 return self._validate_email(feed.author_detail.email)
564 if self.use_publisher_email:
565 if 'email' in feed.get('publisher_detail', []):
566 return self._validate_email(feed.publisher_detail.email)
567 if feed.get('errorreportsto', None):
568 return self._validate_email(feed.errorreportsto)
569 _LOG.debug('no sender address found, fallback to default')
570 return self.from_email
572 def _get_entry_email(self, parsed, entry):
573 """Get the best From email address ('John <jdoe@a.com>')
575 name = self._get_entry_name(parsed=parsed, entry=entry)
576 address = self._get_entry_address(parsed=parsed, entry=entry)
577 return _formataddr((name, address))
579 def _get_entry_tags(self, entry):
580 """Add post tags, if available
582 >>> f = Feed(name='test-feed')
583 >>> f._get_entry_tags({
584 ... 'tags': [{'term': 'tag1',
586 ... 'label': None}]})
588 >>> f._get_entry_tags({
589 ... 'tags': [{'term': 'tag1',
594 ... 'label': None}]})
597 Test some troublesome cases. No tags:
599 >>> f._get_entry_tags({})
603 >>> f._get_entry_tags({'tags': []})
605 Tags without a ``term`` entry:
607 >>> f._get_entry_tags({
608 ... 'tags': [{'scheme': None,
609 ... 'label': None}]})
611 Tags with an empty term:
613 >>> f._get_entry_tags({
614 ... 'tags': [{'term': '',
616 ... 'label': None}]})
618 taglist = [tag['term'] for tag in entry.get('tags', [])
619 if tag.get('term', '')]
621 return ','.join(taglist)
623 def _get_entry_content(self, entry):
624 """Select the best content from an entry.
626 Returns a feedparser content dict.
629 # * We have a bunch of potential contents.
630 # * We go thru looking for our first choice.
631 # (HTML or text, depending on self.html_mail)
632 # * If that doesn't work, we go thru looking for our second choice.
633 # * If that still doesn't work, we just take the first one.
635 # Possible future improvement:
636 # * Instead of just taking the first one
637 # pick the one in the "best" language.
638 # * HACK: hardcoded .html_mail, should take a tuple of media types
639 contents = list(entry.get('content', []))
640 if entry.get('summary_detail', None):
641 contents.append(entry.summary_detail)
643 types = ['text/html', 'text/plain']
645 types = ['text/plain', 'text/html']
646 for content_type in types:
647 for content in contents:
648 if content['type'] == content_type:
652 return {'type': 'text/plain', 'value': ''}
654 def _process_entry_content(self, entry, content, link, subject):
655 "Convert entry content to the requested format."
662 if self.use_css and self.css:
664 ' <style type="text/css">',
672 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
674 '<div id="body"><table><tr><td>',
676 if content['type'] in ('text/html', 'application/xhtml+xml'):
677 lines.append(content['value'].strip())
679 lines.append(_saxutils.escape(content['value'].strip()))
680 lines.append('</td></tr></table></div>')
682 '<div class="footer">'
683 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
685 for enclosure in getattr(entry, 'enclosures', []):
686 if getattr(enclosure, 'url', None):
688 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
690 if getattr(enclosure, 'src', None):
692 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
695 '<p><img src="{}" /></p>'.format(enclosure.src))
696 for elink in getattr(entry, 'links', []):
697 if elink.get('rel', None) == 'via':
700 'http://www.google.com/reader/public/atom/',
701 'http://www.google.com/reader/view/')
703 if elink.get('title', None):
704 title = elink['title']
705 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
713 content['type'] = 'text/html'
714 content['value'] = '\n'.join(lines)
716 else: # not self.html_mail
717 if content['type'] in ('text/html', 'application/xhtml+xml'):
718 lines = [_html2text.html2text(content['value'])]
720 lines = [content['value']]
722 lines.append('URL: {}'.format(link))
723 for enclosure in getattr(entry, 'enclosures', []):
724 if getattr(enclosure, 'url', None):
725 lines.append('Enclosure: {}'.format(enclosure.url))
726 if getattr(enclosure, 'src', None):
727 lines.append('Enclosure: {}'.format(enclosure.src))
728 for elink in getattr(entry, 'links', []):
729 if elink.get('rel', None) == 'via':
732 'http://www.google.com/reader/public/atom/',
733 'http://www.google.com/reader/view/')
735 if elink.get('title', None):
736 title = elink['title']
737 lines.append('Via: {} {}'.format(title, url))
738 content['type'] = 'text/plain'
739 content['value'] = '\n'.join(lines)
742 def _send(self, sender, message):
743 _LOG.info('send message for {}'.format(self))
744 section = self.section
745 if section not in self.config:
747 _email.send(sender=sender, recipient=self.to, message=message,
748 config=self.config, section=section)
750 def run(self, send=True):
751 """Fetch and process the feed, mailing entry emails.
754 ... name='test-feed',
755 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
756 >>> def send(sender, message):
757 ... print('send from {}:'.format(sender))
758 ... print(message.as_string())
759 >>> feed._send = send
760 >>> feed.to = 'jdoe@dummy.invalid'
761 >>> #parsed = feed.run() # enable for debugging
764 raise _error.NoToEmailAddress(feed=self)
765 parsed = self._fetch()
766 for (guid, id_, sender, message) in self._process(parsed):
767 _LOG.debug('new message: {}'.format(message['Subject']))
769 self._send(sender=sender, message=message)
770 self.seen[guid] = id_
771 self.etag = parsed.get('etag', None)
772 self.modified = parsed.get('modified', None)