rss2email/feed.py

   1 # Copyright (C) 2004-2013 Aaron Swartz
   2 #                         Brian Lalor
   3 #                         Dean Jackson
   4 #                         Erik Hetzner
   5 #                         Etienne Millon <me@emillon.org>
   6 #                         Joey Hess
   7 #                         Lindsey Smith <lindsey.smith@gmail.com>
   8 #                         Marcel Ackermann
   9 #                         Martin 'Joey' Schulze
  10 #                         Matej Cepl
  11 #                         W. Trevor King <wking@tremily.us>
  12 #
  13 # This file is part of rss2email.
  14 #
  15 # rss2email is free software: you can redistribute it and/or modify it under
  16 # the terms of the GNU General Public License as published by the Free Software
  17 # Foundation, either version 2 of the License, or (at your option) version 3 of
  18 # the License.
  19 #
  20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
  21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  22 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  23 #
  24 # You should have received a copy of the GNU General Public License along with
  25 # rss2email.  If not, see <http://www.gnu.org/licenses/>.
  26
  27 """Define the ``Feed`` class for handling a single feed
  28 """
  29
  30 import collections as _collections
  31 from email.mime.message import MIMEMessage as _MIMEMessage
  32 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
  33 from email.utils import formataddr as _formataddr
  34 import hashlib as _hashlib
  35 import html.parser as _html_parser
  36 import re as _re
  37 import socket as _socket
  38 import time as _time
  39 import urllib.error as _urllib_error
  40 import urllib.request as _urllib_request
  41 import uuid as _uuid
  42 import xml.sax as _sax
  43 import xml.sax.saxutils as _saxutils
  44
  45 import feedparser as _feedparser
  46 import html2text as _html2text
  47
  48 from . import __url__
  49 from . import __version__
  50 from . import LOG as _LOG
  51 from . import config as _config
  52 from . import email as _email
  53 from . import error as _error
  54 from . import util as _util
  55
  56
  57 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
  58 _feedparser.USER_AGENT = _USER_AGENT
  59 _urllib_request.install_opener(_urllib_request.build_opener())
  60 _SOCKET_ERRORS = []
  61 for e in ['error', 'herror', 'gaierror']:
  62     if hasattr(_socket, e):
  63         _SOCKET_ERRORS.append(getattr(_socket, e))
  64 del e  # cleanup namespace
  65 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
  66
  67 # drv_libxml2 raises:
  68 #   TypeError: 'str' does not support the buffer interface
  69 _feedparser.PREFERRED_XML_PARSERS = []
  70
  71
  72 class Feed (object):
  73     """Utility class for feed manipulation and storage.
  74
  75     >>> import pickle
  76     >>> import sys
  77     >>> from .config import CONFIG
  78
  79     >>> feed = Feed(
  80     ...    name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
  81     >>> print(feed)
  82     test-feed (http://example.com/feed.atom -> a@b.com)
  83     >>> feed.section
  84     'feed.test-feed'
  85     >>> feed.from_email
  86     'user@rss2email.invalid'
  87
  88     >>> feed.from_email = 'a@b.com'
  89     >>> feed.save_to_config()
  90     >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
  91     [DEFAULT]
  92     from = user@rss2email.invalid
  93     ...
  94     verbose = warning
  95     <BLANKLINE>
  96     [feed.test-feed]
  97     url = http://example.com/feed.atom
  98     from = a@b.com
  99     to = a@b.com
 100     <BLANKLINE>
 101
 102     >>> feed.etag = 'dummy etag'
 103     >>> string = pickle.dumps(feed)
 104     >>> feed = pickle.loads(string)
 105     >>> feed.load_from_config(config=CONFIG)
 106     >>> feed.etag
 107     'dummy etag'
 108     >>> feed.url
 109     'http://example.com/feed.atom'
 110
 111     Names can only contain ASCII letters, digits, and '._-'.  Here the
 112     invalid space causes an exception:
 113
 114     >>> Feed(name='invalid name')
 115     Traceback (most recent call last):
 116       ...
 117     rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
 118
 119     You must define a URL:
 120
 121     >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
 122     Traceback (most recent call last):
 123       ...
 124     rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
 125
 126
 127     Cleanup `CONFIG`.
 128
 129     >>> CONFIG['DEFAULT']['to'] = ''
 130     >>> test_section = CONFIG.pop('feed.test-feed')
 131     """
 132     _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
 133
 134     # saved/loaded from feed.dat using __getstate__/__setstate__.
 135     _dynamic_attributes = [
 136         'name',
 137         'etag',
 138         'modified',
 139         'seen',
 140         ]
 141
 142     ## saved/loaded from ConfigParser instance
 143     # attributes that aren't in DEFAULT
 144     _non_default_configured_attributes = [
 145         'url',
 146         ]
 147     # attributes that are in DEFAULT
 148     _default_configured_attributes = [
 149         key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
 150     _default_configured_attributes[
 151         _default_configured_attributes.index('from')
 152         ] = 'from_email'  # `from` is a Python keyword
 153     # all attributes that are saved/loaded from .config
 154     _configured_attributes = (
 155         _non_default_configured_attributes + _default_configured_attributes)
 156     # attribute name -> .config option
 157     _configured_attribute_translations = dict(
 158         (attr,attr) for attr in _non_default_configured_attributes)
 159     _configured_attribute_translations.update(dict(
 160             zip(_default_configured_attributes,
 161                 _config.CONFIG['DEFAULT'].keys())))
 162     # .config option -> attribute name
 163     _configured_attribute_inverse_translations = dict(
 164         (v,k) for k,v in _configured_attribute_translations.items())
 165
 166     # hints for value conversion
 167     _boolean_attributes = [
 168         'digest',
 169         'force_from',
 170         'use_publisher_email',
 171         'friendly_name',
 172         'active',
 173         'date_header',
 174         'trust_guid',
 175         'html_mail',
 176         'use_css',
 177         'unicode_snob',
 178         'links_after_each_paragraph',
 179         'use_smtp',
 180         'smtp_ssl',
 181         ]
 182
 183     _integer_attributes = [
 184         'feed_timeout',
 185         'body_width',
 186         ]
 187
 188     _list_attributes = [
 189         'date_header_order',
 190         'encodings',
 191         ]
 192
 193     _function_attributes = [
 194         'post_process',
 195         'digest_post_process',
 196         ]
 197
 198     def __init__(self, name=None, url=None, to=None, config=None):
 199         self._set_name(name=name)
 200         self.reset()
 201         self.__setstate__(dict(
 202                 (attr, getattr(self, attr))
 203                 for attr in self._dynamic_attributes))
 204         self.load_from_config(config=config)
 205         if url:
 206             self.url = url
 207         if to:
 208             self.to = to
 209
 210     def __str__(self):
 211         return '{} ({} -> {})'.format(self.name, self.url, self.to)
 212
 213     def __repr__(self):
 214         return '<Feed {}>'.format(str(self))
 215
 216     def __getstate__(self):
 217         "Save dyamic attributes"
 218         return dict(
 219             (key,getattr(self,key)) for key in self._dynamic_attributes)
 220
 221     get_state = __getstate__  # make it publicly accessible
 222
 223     def __setstate__(self, state):
 224         "Restore dynamic attributes"
 225         keys = sorted(state.keys())
 226         if keys != sorted(self._dynamic_attributes):
 227             raise ValueError(state)
 228         self._set_name(name=state['name'])
 229         self.__dict__.update(state)
 230
 231     set_state = __setstate__  # make it publicly accessible
 232
 233     def save_to_config(self):
 234         "Save configured attributes"
 235         data = _collections.OrderedDict()
 236         default = self.config['DEFAULT']
 237         for attr in self._configured_attributes:
 238             key = self._configured_attribute_translations[attr]
 239             value = getattr(self, attr)
 240             if value is not None:
 241                 value = self._get_configured_option_value(
 242                     attribute=attr, value=value)
 243                 if (attr in self._non_default_configured_attributes or
 244                     value != default[key]):
 245                     data[key] = value
 246         self.config[self.section] = data
 247
 248     def load_from_config(self, config=None):
 249         "Restore configured attributes"
 250         if config is None:
 251             config = _config.CONFIG
 252         self.config = config
 253         if self.section in self.config:
 254             data = self.config[self.section]
 255         else:
 256             data = self.config['DEFAULT']
 257         keys = sorted(data.keys())
 258         expected = sorted(self._configured_attribute_translations.values())
 259         if keys != expected:
 260             for key in expected:
 261                 if (key not in keys and
 262                     key not in self._non_default_configured_attributes):
 263                     raise _error.InvalidFeedConfig(
 264                         setting=key, feed=self,
 265                         message='missing configuration key: {}'.format(key))
 266             for key in keys:
 267                 if key not in expected:
 268                     raise _error.InvalidFeedConfig(
 269                         setting=key, feed=self,
 270                         message='extra configuration key: {}'.format(key))
 271         data = dict(
 272             (self._configured_attribute_inverse_translations[k],
 273              self._get_configured_attribute_value(
 274                   attribute=self._configured_attribute_inverse_translations[k],
 275                   key=k, data=data))
 276             for k in data.keys())
 277         for attr in self._non_default_configured_attributes:
 278             if attr not in data:
 279                 data[attr] = None
 280         self.__dict__.update(data)
 281
 282     def _get_configured_option_value(self, attribute, value):
 283         if value is None:
 284             return ''
 285         elif attribute in self._list_attributes:
 286             return ', '.join(value)
 287         elif attribute in self._function_attributes:
 288             return _util.import_name(value)
 289         return str(value)
 290
 291     def _get_configured_attribute_value(self, attribute, key, data):
 292         if attribute in self._boolean_attributes:
 293             return data.getboolean(key)
 294         elif attribute in self._integer_attributes:
 295             return data.getint(key)
 296         elif attribute in self._list_attributes:
 297             return [x.strip() for x in data[key].split(',')]
 298         elif attribute in self._function_attributes:
 299             if data[key]:
 300                 return _util.import_function(data[key])
 301             return None
 302         return data[key]
 303
 304     def reset(self):
 305         """Reset dynamic data
 306         """
 307         self.etag = None
 308         self.modified = None
 309         self.seen = {}
 310
 311     def _set_name(self, name):
 312         if not self._name_regexp.match(name):
 313             raise _error.InvalidFeedName(name=name, feed=self)
 314         self.name = name
 315         self.section = 'feed.{}'.format(self.name)
 316
 317     def _fetch(self):
 318         """Fetch and parse a feed using feedparser.
 319
 320         >>> feed = Feed(
 321         ...    name='test-feed',
 322         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 323         >>> parsed = feed._fetch()
 324         >>> parsed.status
 325         200
 326         """
 327         _LOG.info('fetch {}'.format(self))
 328         if not self.url:
 329             raise _error.InvalidFeedConfig(setting='url', feed=self)
 330         if self.section in self.config:
 331             config = self.config[self.section]
 332         else:
 333             config = self.config['DEFAULT']
 334         proxy = config['proxy']
 335         timeout = config.getint('feed-timeout')
 336         kwargs = {}
 337         if proxy:
 338             kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
 339         f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
 340         return f(self.url, self.etag, modified=self.modified, **kwargs)
 341
 342     def _process(self, parsed):
 343         _LOG.info('process {}'.format(self))
 344         self._check_for_errors(parsed)
 345         for entry in reversed(parsed.entries):
 346             _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
 347             processed = self._process_entry(parsed=parsed, entry=entry)
 348             if processed:
 349                 guid,id_,sender,message = processed
 350                 if self.post_process:
 351                     message = self.post_process(
 352                         feed=self, parsed=parsed, entry=entry, guid=guid,
 353                         message=message)
 354                     if not message:
 355                         continue
 356                 yield (guid, id_, sender, message)
 357
 358     def _check_for_errors(self, parsed):
 359         warned = False
 360         status = getattr(parsed, 'status', 200)
 361         _LOG.debug('HTTP status {}'.format(status))
 362         if status == 301:
 363             _LOG.info('redirect {} from {} to {}'.format(
 364                     self.name, self.url, parsed['url']))
 365             self.url = parsed['url']
 366         elif status not in [200, 302, 304]:
 367             raise _error.HTTPError(status=status, feed=self)
 368
 369         http_headers = parsed.get('headers', {})
 370         if http_headers:
 371             _LOG.debug('HTTP headers: {}'.format(http_headers))
 372         if not http_headers:
 373             _LOG.warning('could not get HTTP headers: {}'.format(self))
 374             warned = True
 375         else:
 376             if 'html' in http_headers.get('content-type', 'rss'):
 377                 _LOG.warning('looks like HTML: {}'.format(self))
 378                 warned = True
 379             if http_headers.get('content-length', '1') == '0':
 380                 _LOG.warning('empty page: {}'.format(self))
 381                 warned = True
 382
 383         version = parsed.get('version', None)
 384         if version:
 385             _LOG.debug('feed version {}'.format(version))
 386         else:
 387             _LOG.warning('unrecognized version: {}'.format(self))
 388             warned = True
 389
 390         exc = parsed.get('bozo_exception', None)
 391         if isinstance(exc, _socket.timeout):
 392             _LOG.error('timed out: {}'.format(self))
 393             warned = True
 394         elif isinstance(exc, OSError):
 395             _LOG.error('{}: {}'.format(exc, self))
 396             warned = True
 397         elif isinstance(exc, _SOCKET_ERRORS):
 398             _LOG.error('{}: {}'.format(exc, self))
 399             warned = True
 400         elif isinstance(exc, _feedparser.zlib.error):
 401             _LOG.error('broken compression: {}'.format(self))
 402             warned = True
 403         elif isinstance(exc, (IOError, AttributeError)):
 404             _LOG.error('{}: {}'.format(exc, self))
 405             warned = True
 406         elif isinstance(exc, KeyboardInterrupt):
 407             raise exc
 408         elif isinstance(exc, _sax.SAXParseException):
 409             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
 410             warned = True
 411         elif (parsed.bozo and
 412               isinstance(exc, _feedparser.CharacterEncodingOverride)):
 413             _LOG.warning(
 414                 'incorrectly declared encoding: {}: {}'.format(exc, self))
 415             warned = True
 416         elif parsed.bozo or exc:
 417             if exc is None:
 418                 exc = "can't process"
 419             _LOG.error('processing error: {}: {}'.format(exc, self))
 420             warned = True
 421
 422         if (not warned and
 423             status in [200, 302] and
 424             not parsed.entries and
 425             not version):
 426             raise _error.ProcessingError(parsed=parsed, feed=feed)
 427
 428     def _html2text(self, html, baseurl=''):
 429         self.config.setup_html2text(section=self.section)
 430         return _html2text.html2text(html=html, baseurl=baseurl)
 431
 432     def _process_entry(self, parsed, entry):
 433         id_ = self._get_entry_id(entry)
 434         # If .trust_guid isn't set, we get back hashes of the content.
 435         # Instead of letting these run wild, we put them in context
 436         # by associating them with the actual ID (if it exists).
 437         guid = entry.get('id', id_)
 438         if isinstance(guid, dict):
 439             guid = guid.values()[0]
 440         if guid in self.seen:
 441             if self.seen[guid]['id'] == id_:
 442                 _LOG.debug('already seen {}'.format(id_))
 443                 return  # already seen
 444         sender = self._get_entry_email(parsed=parsed, entry=entry)
 445         subject = self._get_entry_title(entry)
 446         extra_headers = _collections.OrderedDict((
 447                 ('Date', self._get_entry_date(entry)),
 448                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
 449                 ('User-Agent', _USER_AGENT),
 450                 ('X-RSS-Feed', self.url),
 451                 ('X-RSS-ID', id_),
 452                 ('X-RSS-URL', self._get_entry_link(entry)),
 453                 ('X-RSS-TAGS', self._get_entry_tags(entry)),
 454                 ))
 455         for k,v in extra_headers.items():  # remove empty tags, etc.
 456             if v is None:
 457                 extra_headers.pop(k)
 458         if self.bonus_header:
 459             for header in self.bonus_header.splitlines():
 460                 if ':' in header:
 461                     key,value = header.split(':', 1)
 462                     extra_headers[key.strip()] = value.strip()
 463                 else:
 464                     _LOG.warning(
 465                         'malformed bonus-header: {}'.format(
 466                             self.bonus_header))
 467
 468         content = self._get_entry_content(entry)
 469         try:
 470             content = self._process_entry_content(
 471                 entry=entry, content=content, subject=subject)
 472         except _error.ProcessingError as e:
 473             e.parsed = parsed
 474             raise
 475         message = _email.get_message(
 476             sender=sender,
 477             recipient=self.to,
 478             subject=subject,
 479             body=content['value'],
 480             content_type=content['type'].split('/', 1)[1],
 481             extra_headers=extra_headers,
 482             config=self.config,
 483             section=self.section)
 484         return (guid, id_, sender, message)
 485
 486     def _get_entry_id(self, entry):
 487         """Get best ID from an entry."""
 488         if self.trust_guid:
 489             if getattr(entry, 'id', None):
 490                 # Newer versions of feedparser could return a dictionary
 491                 if isinstance(entry.id, dict):
 492                     return entry.id.values()[0]
 493                 return entry.id
 494         content = self._get_entry_content(entry)
 495         content_value = content['value'].strip()
 496         if content_value:
 497             return _hashlib.sha1(
 498                 content_value.encode('unicode-escape')).hexdigest()
 499         elif getattr(entry, 'link', None):
 500             return _hashlib.sha1(
 501                 entry.link.encode('unicode-escape')).hexdigest()
 502         elif getattr(entry, 'title', None):
 503             return _hashlib.sha1(
 504                 entry.title.encode('unicode-escape')).hexdigest()
 505
 506     def _get_entry_link(self, entry):
 507         return entry.get('link', None)
 508
 509     def _get_entry_title(self, entry):
 510         if hasattr(entry, 'title_detail') and entry.title_detail:
 511             title = entry.title_detail.value
 512             if 'html' in entry.title_detail.type:
 513                 title = self._html2text(title)
 514         else:
 515             content = self._get_entry_content(entry)
 516             value = content['value']
 517             if content['type'] in ('text/html', 'application/xhtml+xml'):
 518                 value = self._html2text(value)
 519             title = value[:70]
 520         title = title.replace('\n', ' ').strip()
 521         return title
 522
 523     def _get_entry_date(self, entry):
 524         datetime = _time.gmtime()
 525         if self.date_header:
 526             for datetype in self.date_header_order:
 527                 kind = datetype + '_parsed'
 528                 if entry.get(kind, None):
 529                     datetime = entry[kind]
 530                     break
 531         return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 532
 533     def _get_entry_name(self, parsed, entry):
 534         """Get the best name
 535
 536         >>> import feedparser
 537         >>> f = Feed(name='test-feed')
 538         >>> parsed = feedparser.parse(
 539         ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
 540         ...     '  <entry>\\n'
 541         ...     '    <author>\\n'
 542         ...     '      <name>Example author</name>\\n'
 543         ...     '      <email>me@example.com</email>\\n'
 544         ...     '      <url>http://example.com/</url>\\n'
 545         ...     '    </author>\\n'
 546         ...     '  </entry>\\n'
 547         ...     '</feed>\\n'
 548         ...     )
 549         >>> entry = parsed.entries[0]
 550         >>> f.friendly_name = False
 551         >>> f._get_entry_name(parsed, entry)
 552         ''
 553         >>> f.friendly_name = True
 554         >>> f._get_entry_name(parsed, entry)
 555         'Example author'
 556         """
 557         if not self.friendly_name:
 558             return ''
 559         parts = ['']
 560         feed = parsed.feed
 561         parts.append(feed.get('title', ''))
 562         for x in [entry, feed]:
 563             if 'name' in x.get('author_detail', []):
 564                 if x.author_detail.name:
 565                     if ''.join(parts):
 566                         parts.append(': ')
 567                     parts.append(x.author_detail.name)
 568                     break
 569         if not ''.join(parts) and self.use_publisher_email:
 570             if 'name' in feed.get('publisher_detail', []):
 571                 if ''.join(parts):
 572                     parts.append(': ')
 573                 parts.append(feed.publisher_detail.name)
 574         return _html2text.unescape(''.join(parts))
 575
 576     def _validate_email(self, email, default=None):
 577         """Do a basic quality check on email address
 578
 579         Return `default` if the address doesn't appear to be
 580         well-formed.  If `default` is `None`, return
 581         `self.from_email`.
 582
 583         >>> f = Feed(name='test-feed')
 584         >>> f._validate_email('valid@example.com', 'default@example.com')
 585         'valid@example.com'
 586         >>> f._validate_email('invalid@', 'default@example.com')
 587         'default@example.com'
 588         >>> f._validate_email('@invalid', 'default@example.com')
 589         'default@example.com'
 590         >>> f._validate_email('invalid', 'default@example.com')
 591         'default@example.com'
 592         """
 593         parts = email.split('@')
 594         if len(parts) != 2 or '' in parts:
 595             if default is None:
 596                 return self.from_email
 597             return default
 598         return email
 599
 600     def _get_entry_address(self, parsed, entry):
 601         """Get the best From email address ('<jdoe@a.com>')
 602
 603         If the best guess isn't well-formed (something@somthing.com),
 604         use `self.from_email` instead.
 605         """
 606         if self.force_from:
 607             return self.from_email
 608         feed = parsed.feed
 609         if 'email' in entry.get('author_detail', []):
 610             return self._validate_email(entry.author_detail.email)
 611         elif 'email' in feed.get('author_detail', []):
 612             return self._validate_email(feed.author_detail.email)
 613         if self.use_publisher_email:
 614             if 'email' in feed.get('publisher_detail', []):
 615                 return self._validate_email(feed.publisher_detail.email)
 616             if feed.get('errorreportsto', None):
 617                 return self._validate_email(feed.errorreportsto)
 618         _LOG.debug('no sender address found, fallback to default')
 619         return self.from_email
 620
 621     def _get_entry_email(self, parsed, entry):
 622         """Get the best From email address ('John <jdoe@a.com>')
 623         """
 624         name = self._get_entry_name(parsed=parsed, entry=entry)
 625         address = self._get_entry_address(parsed=parsed, entry=entry)
 626         return _formataddr((name, address))
 627
 628     def _get_entry_tags(self, entry):
 629         """Add post tags, if available
 630
 631         >>> f = Feed(name='test-feed')
 632         >>> f._get_entry_tags({
 633         ...         'tags': [{'term': 'tag1',
 634         ...                   'scheme': None,
 635         ...                   'label': None}]})
 636         'tag1'
 637         >>> f._get_entry_tags({
 638         ...         'tags': [{'term': 'tag1',
 639         ...                   'scheme': None,
 640         ...                   'label': None},
 641         ...                  {'term': 'tag2',
 642         ...                   'scheme': None,
 643         ...                   'label': None}]})
 644         'tag1,tag2'
 645
 646         Test some troublesome cases.  No tags:
 647
 648         >>> f._get_entry_tags({})
 649
 650         Empty tags:
 651
 652         >>> f._get_entry_tags({'tags': []})
 653
 654         Tags without a ``term`` entry:
 655
 656         >>> f._get_entry_tags({
 657         ...         'tags': [{'scheme': None,
 658         ...                   'label': None}]})
 659
 660         Tags with an empty term:
 661
 662         >>> f._get_entry_tags({
 663         ...         'tags': [{'term': '',
 664         ...                   'scheme': None,
 665         ...                   'label': None}]})
 666         """
 667         taglist = [tag['term'] for tag in entry.get('tags', [])
 668                    if tag.get('term', '')]
 669         if taglist:
 670             return ','.join(taglist)
 671
 672     def _get_entry_content(self, entry):
 673         """Select the best content from an entry.
 674
 675         Returns a feedparser content dict.
 676         """
 677         # How this works:
 678         #  * We have a bunch of potential contents.
 679         #  * We go thru looking for our first choice.
 680         #    (HTML or text, depending on self.html_mail)
 681         #  * If that doesn't work, we go thru looking for our second choice.
 682         #  * If that still doesn't work, we just take the first one.
 683         #
 684         # Possible future improvement:
 685         #  * Instead of just taking the first one
 686         #    pick the one in the "best" language.
 687         #  * HACK: hardcoded .html_mail, should take a tuple of media types
 688         contents = list(entry.get('content', []))
 689         if entry.get('summary_detail', None):
 690             contents.append(entry.summary_detail)
 691         if self.html_mail:
 692             types = ['text/html', 'text/plain']
 693         else:
 694             types = ['text/plain', 'text/html']
 695         for content_type in types:
 696             for content in contents:
 697                 if content['type'] == content_type:
 698                     return content
 699         if contents:
 700             return contents[0]
 701         return {'type': 'text/plain', 'value': ''}
 702
 703     def _process_entry_content(self, entry, content, subject):
 704         "Convert entry content to the requested format."
 705         link = self._get_entry_link(entry)
 706         if self.html_mail:
 707             lines = [
 708                 '<!DOCTYPE html>',
 709                 '<html>',
 710                 '  <head>',
 711                 ]
 712             if self.use_css and self.css:
 713                 lines.extend([
 714                         '    <style type="text/css">',
 715                         self.css,
 716                         '    </style>',
 717                         ])
 718             lines.extend([
 719                     '</head>',
 720                     '<body>',
 721                     '<div id="entry">',
 722                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
 723                         link, subject),
 724                     '<div id="body">',
 725                     ])
 726             if content['type'] in ('text/html', 'application/xhtml+xml'):
 727                 lines.append(content['value'].strip())
 728             else:
 729                 lines.append(_saxutils.escape(content['value'].strip()))
 730             lines.append('</div>')
 731             lines.extend([
 732                     '<div class="footer">'
 733                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
 734                     ])
 735             for enclosure in getattr(entry, 'enclosures', []):
 736                 if getattr(enclosure, 'url', None):
 737                     lines.append(
 738                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 739                             enclosure.url))
 740                 if getattr(enclosure, 'src', None):
 741                     lines.append(
 742                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 743                             enclosure.src))
 744                     lines.append(
 745                         '<p><img src="{}" /></p>'.format(enclosure.src))
 746             for elink in getattr(entry, 'links', []):
 747                 if elink.get('rel', None) == 'via':
 748                     url = elink['href']
 749                     title = elink.get('title', url)
 750                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
 751                             url, title))
 752             lines.extend([
 753                     '</div>',  # /footer
 754                     '</div>',  # /entry
 755                     '</body>',
 756                     '</html>',
 757                     ''])
 758             content['type'] = 'text/html'
 759             content['value'] = '\n'.join(lines)
 760             return content
 761         else:  # not self.html_mail
 762             if content['type'] in ('text/html', 'application/xhtml+xml'):
 763                 try:
 764                     lines = [self._html2text(content['value'])]
 765                 except _html_parser.HTMLParseError as e:
 766                     raise _error.ProcessingError(parsed=None, feed=self)
 767             else:
 768                 lines = [content['value']]
 769             lines.append('')
 770             lines.append('URL: {}'.format(link))
 771             for enclosure in getattr(entry, 'enclosures', []):
 772                 if getattr(enclosure, 'url', None):
 773                     lines.append('Enclosure: {}'.format(enclosure.url))
 774                 if getattr(enclosure, 'src', None):
 775                     lines.append('Enclosure: {}'.format(enclosure.src))
 776             for elink in getattr(entry, 'links', []):
 777                 if elink.get('rel', None) == 'via':
 778                     url = elink['href']
 779                     title = elink.get('title', url)
 780                     lines.append('Via: {} {}'.format(title, url))
 781             content['type'] = 'text/plain'
 782             content['value'] = '\n'.join(lines)
 783             return content
 784
 785     def _send(self, sender, message):
 786         _LOG.info('send message for {}'.format(self))
 787         section = self.section
 788         if section not in self.config:
 789             section = 'DEFAULT'
 790         _email.send(sender=sender, recipient=self.to, message=message,
 791                     config=self.config, section=section)
 792
 793     def run(self, send=True):
 794         """Fetch and process the feed, mailing entry emails.
 795
 796         >>> feed = Feed(
 797         ...    name='test-feed',
 798         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 799         >>> def send(sender, message):
 800         ...    print('send from {}:'.format(sender))
 801         ...    print(message.as_string())
 802         >>> feed._send = send
 803         >>> feed.to = 'jdoe@dummy.invalid'
 804         >>> #parsed = feed.run()  # enable for debugging
 805         """
 806         if not self.to:
 807             raise _error.NoToEmailAddress(feed=self)
 808         parsed = self._fetch()
 809
 810         if self.digest:
 811             digest = self._new_digest()
 812             seen = []
 813
 814         for (guid, id_, sender, message) in self._process(parsed):
 815             _LOG.debug('new message: {}'.format(message['Subject']))
 816             if self.digest:
 817                 seen.append((guid, id_))
 818                 self._append_to_digest(digest=digest, message=message)
 819             else:
 820                 if send:
 821                     self._send(sender=sender, message=message)
 822                 if guid not in self.seen:
 823                     self.seen[guid] = {}
 824                 self.seen[guid]['id'] = id_
 825
 826         if self.digest and seen:
 827             if self.digest_post_process:
 828                 digest = self.digest_post_process(
 829                     feed=self, parsed=parsed, seen=seen, message=digest)
 830                 if not digest:
 831                     return
 832             self._send_digest(
 833                 digest=digest, seen=seen, sender=sender, send=send)
 834
 835         self.etag = parsed.get('etag', None)
 836         self.modified = parsed.get('modified', None)
 837
 838     def _new_digest(self):
 839         digest = _MIMEMultipart('digest')
 840         digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
 841         digest['Subject'] = 'digest for {}'.format(self.name)
 842         digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
 843         digest['User-Agent'] = _USER_AGENT
 844         digest['X-RSS-Feed'] = self.url
 845         return digest
 846
 847     def _append_to_digest(self, digest, message):
 848         part = _MIMEMessage(message)
 849         part.add_header('Content-Disposition', 'attachment')
 850         digest.attach(part)
 851
 852     def _send_digest(self, digest, seen, sender, send=True):
 853         """Send a digest message
 854
 855         The date is extracted from the last message in the digest
 856         payload.  We assume that this part exists.  If you don't have
 857         any messages in the digest, don't call this function.
 858         """
 859         digest['From'] = sender  # TODO: _Header(), _formataddr()...
 860         last_part = digest.get_payload()[-1]
 861         last_message = last_part.get_payload()[0]
 862         digest['Date'] = last_message['Date']
 863
 864         _LOG.debug('new digest for {}'.format(self))
 865         if send:
 866             self._send(sender=sender, message=digest)
 867         for (guid, id_) in seen:
 868             if guid not in self.seen:
 869                 self.seen[guid] = {}
 870             self.seen[guid]['id'] = id_