rss2email/feed.py

   1 # -*- coding: utf-8 -*-
   2 # Copyright (C) 2004-2013 Aaron Swartz
   3 #                         Brian Lalor
   4 #                         Dean Jackson
   5 #                         Dennis Keitzel <github@pinshot.net>
   6 #                         Erik Hetzner
   7 #                         Etienne Millon <me@emillon.org>
   8 #                         J. Lewis Muir <jlmuir@imca-cat.org>
   9 #                         Joey Hess
  10 #                         Lindsey Smith <lindsey.smith@gmail.com>
  11 #                         Marcel Ackermann
  12 #                         Martin 'Joey' Schulze
  13 #                         Matej Cepl
  14 #                         W. Trevor King <wking@tremily.us>
  15 #
  16 # This file is part of rss2email.
  17 #
  18 # rss2email is free software: you can redistribute it and/or modify it under
  19 # the terms of the GNU General Public License as published by the Free Software
  20 # Foundation, either version 2 of the License, or (at your option) version 3 of
  21 # the License.
  22 #
  23 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
  24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  25 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  26 #
  27 # You should have received a copy of the GNU General Public License along with
  28 # rss2email.  If not, see <http://www.gnu.org/licenses/>.
  29
  30 """Define the ``Feed`` class for handling a single feed
  31 """
  32
  33 import collections as _collections
  34 from email.mime.message import MIMEMessage as _MIMEMessage
  35 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
  36 from email.utils import formataddr as _formataddr
  37 import hashlib as _hashlib
  38 import html.parser as _html_parser
  39 import re as _re
  40 import socket as _socket
  41 import time as _time
  42 import urllib.error as _urllib_error
  43 import urllib.request as _urllib_request
  44 import uuid as _uuid
  45 import xml.sax as _sax
  46 import xml.sax.saxutils as _saxutils
  47
  48 import feedparser as _feedparser
  49 import html2text as _html2text
  50
  51 from . import __url__
  52 from . import __version__
  53 from . import LOG as _LOG
  54 from . import config as _config
  55 from . import email as _email
  56 from . import error as _error
  57 from . import util as _util
  58
  59
  60 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
  61 _feedparser.USER_AGENT = _USER_AGENT
  62 _urllib_request.install_opener(_urllib_request.build_opener())
  63 _SOCKET_ERRORS = []
  64 for e in ['error', 'herror', 'gaierror']:
  65     if hasattr(_socket, e):
  66         _SOCKET_ERRORS.append(getattr(_socket, e))
  67 del e  # cleanup namespace
  68 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
  69
  70 # drv_libxml2 raises:
  71 #   TypeError: 'str' does not support the buffer interface
  72 _feedparser.PREFERRED_XML_PARSERS = []
  73
  74
  75 class Feed (object):
  76     """Utility class for feed manipulation and storage.
  77
  78     >>> import pickle
  79     >>> import sys
  80     >>> from .config import CONFIG
  81
  82     >>> feed = Feed(
  83     ...    name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
  84     >>> print(feed)
  85     test-feed (http://example.com/feed.atom -> a@b.com)
  86     >>> feed.section
  87     'feed.test-feed'
  88     >>> feed.from_email
  89     'user@rss2email.invalid'
  90
  91     >>> feed.from_email = 'a@b.com'
  92     >>> feed.save_to_config()
  93     >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
  94     [DEFAULT]
  95     from = user@rss2email.invalid
  96     ...
  97     verbose = warning
  98     <BLANKLINE>
  99     [feed.test-feed]
 100     url = http://example.com/feed.atom
 101     from = a@b.com
 102     to = a@b.com
 103     <BLANKLINE>
 104
 105     >>> feed.etag = 'dummy etag'
 106     >>> string = pickle.dumps(feed)
 107     >>> feed = pickle.loads(string)
 108     >>> feed.load_from_config(config=CONFIG)
 109     >>> feed.etag
 110     'dummy etag'
 111     >>> feed.url
 112     'http://example.com/feed.atom'
 113
 114     Names can only contain letters, digits, and '._-'.  Here the
 115     invalid space causes an exception:
 116
 117     >>> Feed(name='invalid name')
 118     Traceback (most recent call last):
 119       ...
 120     rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
 121
 122     However, you aren't restricted to ASCII letters:
 123
 124     >>> Feed(name='Αθήνα')
 125     <Feed Αθήνα (None -> )>
 126
 127     You must define a URL:
 128
 129     >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
 130     Traceback (most recent call last):
 131       ...
 132     rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
 133
 134
 135     Cleanup `CONFIG`.
 136
 137     >>> CONFIG['DEFAULT']['to'] = ''
 138     >>> test_section = CONFIG.pop('feed.test-feed')
 139
 140     """
 141     _name_regexp = _re.compile('^[\w\d.-]+$')
 142
 143     # saved/loaded from feed.dat using __getstate__/__setstate__.
 144     _dynamic_attributes = [
 145         'name',
 146         'etag',
 147         'modified',
 148         'seen',
 149         ]
 150
 151     ## saved/loaded from ConfigParser instance
 152     # attributes that aren't in DEFAULT
 153     _non_default_configured_attributes = [
 154         'url',
 155         ]
 156     # attributes that are in DEFAULT
 157     _default_configured_attributes = [
 158         key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
 159     _default_configured_attributes[
 160         _default_configured_attributes.index('from')
 161         ] = 'from_email'  # `from` is a Python keyword
 162     # all attributes that are saved/loaded from .config
 163     _configured_attributes = (
 164         _non_default_configured_attributes + _default_configured_attributes)
 165     # attribute name -> .config option
 166     _configured_attribute_translations = dict(
 167         (attr,attr) for attr in _non_default_configured_attributes)
 168     _configured_attribute_translations.update(dict(
 169             zip(_default_configured_attributes,
 170                 _config.CONFIG['DEFAULT'].keys())))
 171     # .config option -> attribute name
 172     _configured_attribute_inverse_translations = dict(
 173         (v,k) for k,v in _configured_attribute_translations.items())
 174
 175     # hints for value conversion
 176     _boolean_attributes = [
 177         'digest',
 178         'force_from',
 179         'use_publisher_email',
 180         'active',
 181         'date_header',
 182         'trust_guid',
 183         'html_mail',
 184         'use_css',
 185         'unicode_snob',
 186         'links_after_each_paragraph',
 187         'use_smtp',
 188         'smtp_ssl',
 189         ]
 190
 191     _integer_attributes = [
 192         'feed_timeout',
 193         'body_width',
 194         ]
 195
 196     _list_attributes = [
 197         'date_header_order',
 198         'encodings',
 199         ]
 200
 201     _function_attributes = [
 202         'post_process',
 203         'digest_post_process',
 204         ]
 205
 206     def __init__(self, name=None, url=None, to=None, config=None):
 207         self._set_name(name=name)
 208         self.reset()
 209         self.__setstate__(dict(
 210                 (attr, getattr(self, attr))
 211                 for attr in self._dynamic_attributes))
 212         self.load_from_config(config=config)
 213         if url:
 214             self.url = url
 215         if to:
 216             self.to = to
 217
 218     def __str__(self):
 219         return '{} ({} -> {})'.format(self.name, self.url, self.to)
 220
 221     def __repr__(self):
 222         return '<Feed {}>'.format(str(self))
 223
 224     def __getstate__(self):
 225         "Save dyamic attributes"
 226         return dict(
 227             (key,getattr(self,key)) for key in self._dynamic_attributes)
 228
 229     get_state = __getstate__  # make it publicly accessible
 230
 231     def __setstate__(self, state):
 232         "Restore dynamic attributes"
 233         keys = sorted(state.keys())
 234         if keys != sorted(self._dynamic_attributes):
 235             raise ValueError(state)
 236         self._set_name(name=state['name'])
 237         self.__dict__.update(state)
 238
 239     set_state = __setstate__  # make it publicly accessible
 240
 241     def save_to_config(self):
 242         "Save configured attributes"
 243         data = _collections.OrderedDict()
 244         default = self.config['DEFAULT']
 245         for attr in self._configured_attributes:
 246             key = self._configured_attribute_translations[attr]
 247             value = getattr(self, attr)
 248             if value is not None:
 249                 value = self._get_configured_option_value(
 250                     attribute=attr, value=value)
 251                 if (attr in self._non_default_configured_attributes or
 252                     value != default[key]):
 253                     data[key] = value
 254         self.config[self.section] = data
 255
 256     def load_from_config(self, config=None):
 257         "Restore configured attributes"
 258         if config is None:
 259             config = _config.CONFIG
 260         self.config = config
 261         if self.section in self.config:
 262             data = self.config[self.section]
 263         else:
 264             data = self.config['DEFAULT']
 265         keys = sorted(data.keys())
 266         expected = sorted(self._configured_attribute_translations.values())
 267         if keys != expected:
 268             for key in expected:
 269                 if (key not in keys and
 270                     key not in self._non_default_configured_attributes):
 271                     raise _error.InvalidFeedConfig(
 272                         setting=key, feed=self,
 273                         message='missing configuration key: {}'.format(key))
 274             for key in keys:
 275                 if key not in expected:
 276                     raise _error.InvalidFeedConfig(
 277                         setting=key, feed=self,
 278                         message='extra configuration key: {}'.format(key))
 279         data = dict(
 280             (self._configured_attribute_inverse_translations[k],
 281              self._get_configured_attribute_value(
 282                   attribute=self._configured_attribute_inverse_translations[k],
 283                   key=k, data=data))
 284             for k in data.keys())
 285         for attr in self._non_default_configured_attributes:
 286             if attr not in data:
 287                 data[attr] = None
 288         self.__dict__.update(data)
 289
 290     def _get_configured_option_value(self, attribute, value):
 291         if value is None:
 292             return ''
 293         elif attribute in self._list_attributes:
 294             return ', '.join(value)
 295         elif attribute in self._function_attributes:
 296             return _util.import_name(value)
 297         return str(value)
 298
 299     def _get_configured_attribute_value(self, attribute, key, data):
 300         if attribute in self._boolean_attributes:
 301             return data.getboolean(key)
 302         elif attribute in self._integer_attributes:
 303             return data.getint(key)
 304         elif attribute in self._list_attributes:
 305             return [x.strip() for x in data[key].split(',')]
 306         elif attribute in self._function_attributes:
 307             if data[key]:
 308                 return _util.import_function(data[key])
 309             return None
 310         return data[key]
 311
 312     def reset(self):
 313         """Reset dynamic data
 314         """
 315         self.etag = None
 316         self.modified = None
 317         self.seen = {}
 318
 319     def _set_name(self, name):
 320         if not self._name_regexp.match(name):
 321             raise _error.InvalidFeedName(name=name, feed=self)
 322         self.name = name
 323         self.section = 'feed.{}'.format(self.name)
 324
 325     def _fetch(self):
 326         """Fetch and parse a feed using feedparser.
 327
 328         >>> feed = Feed(
 329         ...    name='test-feed',
 330         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 331         >>> parsed = feed._fetch()
 332         >>> parsed.status
 333         200
 334         """
 335         _LOG.info('fetch {}'.format(self))
 336         if not self.url:
 337             raise _error.InvalidFeedConfig(setting='url', feed=self)
 338         if self.section in self.config:
 339             config = self.config[self.section]
 340         else:
 341             config = self.config['DEFAULT']
 342         proxy = config['proxy']
 343         timeout = config.getint('feed-timeout')
 344         kwargs = {}
 345         if proxy:
 346             kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
 347         f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
 348         return f(self.url, self.etag, modified=self.modified, **kwargs)
 349
 350     def _process(self, parsed):
 351         _LOG.info('process {}'.format(self))
 352         self._check_for_errors(parsed)
 353         for entry in reversed(parsed.entries):
 354             _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
 355             processed = self._process_entry(parsed=parsed, entry=entry)
 356             if processed:
 357                 guid,id_,sender,message = processed
 358                 if self.post_process:
 359                     message = self.post_process(
 360                         feed=self, parsed=parsed, entry=entry, guid=guid,
 361                         message=message)
 362                     if not message:
 363                         continue
 364                 yield (guid, id_, sender, message)
 365
 366     def _check_for_errors(self, parsed):
 367         warned = False
 368         status = getattr(parsed, 'status', 200)
 369         _LOG.debug('HTTP status {}'.format(status))
 370         if status == 301:
 371             _LOG.info('redirect {} from {} to {}'.format(
 372                     self.name, self.url, parsed['url']))
 373             self.url = parsed['url']
 374         elif status not in [200, 302, 304]:
 375             raise _error.HTTPError(status=status, feed=self)
 376
 377         http_headers = parsed.get('headers', {})
 378         if http_headers:
 379             _LOG.debug('HTTP headers: {}'.format(http_headers))
 380         if not http_headers:
 381             _LOG.warning('could not get HTTP headers: {}'.format(self))
 382             warned = True
 383         else:
 384             if 'html' in http_headers.get('content-type', 'rss'):
 385                 _LOG.warning('looks like HTML: {}'.format(self))
 386                 warned = True
 387             if http_headers.get('content-length', '1') == '0':
 388                 _LOG.warning('empty page: {}'.format(self))
 389                 warned = True
 390
 391         version = parsed.get('version', None)
 392         if version:
 393             _LOG.debug('feed version {}'.format(version))
 394         else:
 395             _LOG.warning('unrecognized version: {}'.format(self))
 396             warned = True
 397
 398         exc = parsed.get('bozo_exception', None)
 399         if isinstance(exc, _socket.timeout):
 400             _LOG.error('timed out: {}'.format(self))
 401             warned = True
 402         elif isinstance(exc, OSError):
 403             _LOG.error('{}: {}'.format(exc, self))
 404             warned = True
 405         elif isinstance(exc, _SOCKET_ERRORS):
 406             _LOG.error('{}: {}'.format(exc, self))
 407             warned = True
 408         elif isinstance(exc, _feedparser.zlib.error):
 409             _LOG.error('broken compression: {}'.format(self))
 410             warned = True
 411         elif isinstance(exc, (IOError, AttributeError)):
 412             _LOG.error('{}: {}'.format(exc, self))
 413             warned = True
 414         elif isinstance(exc, KeyboardInterrupt):
 415             raise exc
 416         elif isinstance(exc, _sax.SAXParseException):
 417             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
 418             warned = True
 419         elif (parsed.bozo and
 420               isinstance(exc, _feedparser.CharacterEncodingOverride)):
 421             _LOG.warning(
 422                 'incorrectly declared encoding: {}: {}'.format(exc, self))
 423             warned = True
 424         elif parsed.bozo or exc:
 425             if exc is None:
 426                 exc = "can't process"
 427             _LOG.error('processing error: {}: {}'.format(exc, self))
 428             warned = True
 429
 430         if (not warned and
 431             status in [200, 302] and
 432             not parsed.entries and
 433             not version):
 434             raise _error.ProcessingError(parsed=parsed, feed=feed)
 435
 436     def _html2text(self, html, baseurl='', default=None):
 437         self.config.setup_html2text(section=self.section)
 438         try:
 439             return _html2text.html2text(html=html, baseurl=baseurl)
 440         except _html_parser.HTMLParseError as e:
 441             if default is not None:
 442                 return default
 443             raise
 444
 445     def _process_entry(self, parsed, entry):
 446         id_ = self._get_entry_id(entry)
 447         # If .trust_guid isn't set, we get back hashes of the content.
 448         # Instead of letting these run wild, we put them in context
 449         # by associating them with the actual ID (if it exists).
 450         guid = entry.get('id', id_)
 451         if isinstance(guid, dict):
 452             guid = guid.values()[0]
 453         if guid in self.seen:
 454             if self.seen[guid]['id'] == id_:
 455                 _LOG.debug('already seen {}'.format(id_))
 456                 return  # already seen
 457         sender = self._get_entry_email(parsed=parsed, entry=entry)
 458         subject = self._get_entry_title(entry)
 459         extra_headers = _collections.OrderedDict((
 460                 ('Date', self._get_entry_date(entry)),
 461                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
 462                 ('User-Agent', _USER_AGENT),
 463                 ('X-RSS-Feed', self.url),
 464                 ('X-RSS-ID', id_),
 465                 ('X-RSS-URL', self._get_entry_link(entry)),
 466                 ('X-RSS-TAGS', self._get_entry_tags(entry)),
 467                 ))
 468         for k,v in extra_headers.items():  # remove empty tags, etc.
 469             if v is None:
 470                 extra_headers.pop(k)
 471         if self.bonus_header:
 472             for header in self.bonus_header.splitlines():
 473                 if ':' in header:
 474                     key,value = header.split(':', 1)
 475                     extra_headers[key.strip()] = value.strip()
 476                 else:
 477                     _LOG.warning(
 478                         'malformed bonus-header: {}'.format(
 479                             self.bonus_header))
 480
 481         content = self._get_entry_content(entry)
 482         try:
 483             content = self._process_entry_content(
 484                 entry=entry, content=content, subject=subject)
 485         except _error.ProcessingError as e:
 486             e.parsed = parsed
 487             raise
 488         message = _email.get_message(
 489             sender=sender,
 490             recipient=self.to,
 491             subject=subject,
 492             body=content['value'],
 493             content_type=content['type'].split('/', 1)[1],
 494             extra_headers=extra_headers,
 495             config=self.config,
 496             section=self.section)
 497         return (guid, id_, sender, message)
 498
 499     def _get_entry_id(self, entry):
 500         """Get best ID from an entry."""
 501         if self.trust_guid:
 502             if getattr(entry, 'id', None):
 503                 # Newer versions of feedparser could return a dictionary
 504                 if isinstance(entry.id, dict):
 505                     return entry.id.values()[0]
 506                 return entry.id
 507         content = self._get_entry_content(entry)
 508         content_value = content['value'].strip()
 509         if content_value:
 510             return _hashlib.sha1(
 511                 content_value.encode('unicode-escape')).hexdigest()
 512         elif getattr(entry, 'link', None):
 513             return _hashlib.sha1(
 514                 entry.link.encode('unicode-escape')).hexdigest()
 515         elif getattr(entry, 'title', None):
 516             return _hashlib.sha1(
 517                 entry.title.encode('unicode-escape')).hexdigest()
 518
 519     def _get_entry_link(self, entry):
 520         return entry.get('link', None)
 521
 522     def _get_entry_title(self, entry):
 523         if hasattr(entry, 'title_detail') and entry.title_detail:
 524             title = entry.title_detail.value
 525             if 'html' in entry.title_detail.type:
 526                 title = self._html2text(title, default=title)
 527         else:
 528             content = self._get_entry_content(entry)
 529             value = content['value']
 530             if content['type'] in ('text/html', 'application/xhtml+xml'):
 531                 value = self._html2text(value, default=value)
 532             title = value[:70]
 533         title = title.replace('\n', ' ').strip()
 534         return title
 535
 536     def _get_entry_date(self, entry):
 537         datetime = _time.gmtime()
 538         if self.date_header:
 539             for datetype in self.date_header_order:
 540                 kind = datetype + '_parsed'
 541                 if entry.get(kind, None):
 542                     datetime = entry[kind]
 543                     break
 544         return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 545
 546     def _get_entry_name(self, parsed, entry):
 547         """Get the best name
 548
 549         >>> import feedparser
 550         >>> f = Feed(name='test-feed')
 551         >>> parsed = feedparser.parse(
 552         ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
 553         ...     '  <entry>\\n'
 554         ...     '    <author>\\n'
 555         ...     '      <name>Example author</name>\\n'
 556         ...     '      <email>me@example.com</email>\\n'
 557         ...     '      <url>http://example.com/</url>\\n'
 558         ...     '    </author>\\n'
 559         ...     '  </entry>\\n'
 560         ...     '</feed>\\n'
 561         ...     )
 562         >>> entry = parsed.entries[0]
 563         >>> f.name_format = ''
 564         >>> f._get_entry_name(parsed, entry)
 565         ''
 566         >>> f.name_format = '{author}'
 567         >>> f._get_entry_name(parsed, entry)
 568         'Example author'
 569         >>> f.name_format = '{feed-title}: {author}'
 570         >>> f._get_entry_name(parsed, entry)
 571         ': Example author'
 572         >>> f.name_format = '{author} ({feed.name})'
 573         >>> f._get_entry_name(parsed, entry)
 574         'Example author (test-feed)'
 575         """
 576         if not self.name_format:
 577             return ''
 578         data = {
 579             'feed': self,
 580             'feed-title': '<feed title>',
 581             'author': '<author>',
 582             'publisher': '<publisher>',
 583             }
 584         feed = parsed.feed
 585         data['feed-title'] = feed.get('title', '')
 586         for x in [entry, feed]:
 587             if 'name' in x.get('author_detail', []):
 588                 if x.author_detail.name:
 589                     data['author'] = x.author_detail.name
 590                     break
 591         if 'name' in feed.get('publisher_detail', []):
 592             data['publisher'] = feed.publisher_detail.name
 593         name = self.name_format.format(**data)
 594         return _html2text.unescape(name)
 595
 596     def _validate_email(self, email, default=None):
 597         """Do a basic quality check on email address
 598
 599         Return `default` if the address doesn't appear to be
 600         well-formed.  If `default` is `None`, return
 601         `self.from_email`.
 602
 603         >>> f = Feed(name='test-feed')
 604         >>> f._validate_email('valid@example.com', 'default@example.com')
 605         'valid@example.com'
 606         >>> f._validate_email('invalid@', 'default@example.com')
 607         'default@example.com'
 608         >>> f._validate_email('@invalid', 'default@example.com')
 609         'default@example.com'
 610         >>> f._validate_email('invalid', 'default@example.com')
 611         'default@example.com'
 612         """
 613         parts = email.split('@')
 614         if len(parts) != 2 or '' in parts:
 615             if default is None:
 616                 return self.from_email
 617             return default
 618         return email
 619
 620     def _get_entry_address(self, parsed, entry):
 621         """Get the best From email address ('<jdoe@a.com>')
 622
 623         If the best guess isn't well-formed (something@somthing.com),
 624         use `self.from_email` instead.
 625         """
 626         if self.force_from:
 627             return self.from_email
 628         feed = parsed.feed
 629         if 'email' in entry.get('author_detail', []):
 630             return self._validate_email(entry.author_detail.email)
 631         elif 'email' in feed.get('author_detail', []):
 632             return self._validate_email(feed.author_detail.email)
 633         if self.use_publisher_email:
 634             if 'email' in feed.get('publisher_detail', []):
 635                 return self._validate_email(feed.publisher_detail.email)
 636             if feed.get('errorreportsto', None):
 637                 return self._validate_email(feed.errorreportsto)
 638         _LOG.debug('no sender address found, fallback to default')
 639         return self.from_email
 640
 641     def _get_entry_email(self, parsed, entry):
 642         """Get the best From email address ('John <jdoe@a.com>')
 643         """
 644         name = self._get_entry_name(parsed=parsed, entry=entry)
 645         address = self._get_entry_address(parsed=parsed, entry=entry)
 646         return _formataddr((name, address))
 647
 648     def _get_entry_tags(self, entry):
 649         """Add post tags, if available
 650
 651         >>> f = Feed(name='test-feed')
 652         >>> f._get_entry_tags({
 653         ...         'tags': [{'term': 'tag1',
 654         ...                   'scheme': None,
 655         ...                   'label': None}]})
 656         'tag1'
 657         >>> f._get_entry_tags({
 658         ...         'tags': [{'term': 'tag1',
 659         ...                   'scheme': None,
 660         ...                   'label': None},
 661         ...                  {'term': 'tag2',
 662         ...                   'scheme': None,
 663         ...                   'label': None}]})
 664         'tag1,tag2'
 665
 666         Test some troublesome cases.  No tags:
 667
 668         >>> f._get_entry_tags({})
 669
 670         Empty tags:
 671
 672         >>> f._get_entry_tags({'tags': []})
 673
 674         Tags without a ``term`` entry:
 675
 676         >>> f._get_entry_tags({
 677         ...         'tags': [{'scheme': None,
 678         ...                   'label': None}]})
 679
 680         Tags with an empty term:
 681
 682         >>> f._get_entry_tags({
 683         ...         'tags': [{'term': '',
 684         ...                   'scheme': None,
 685         ...                   'label': None}]})
 686         """
 687         taglist = [tag['term'] for tag in entry.get('tags', [])
 688                    if tag.get('term', '')]
 689         if taglist:
 690             return ','.join(taglist)
 691
 692     def _get_entry_content(self, entry):
 693         """Select the best content from an entry.
 694
 695         Returns a feedparser content dict.
 696         """
 697         # How this works:
 698         #  * We have a bunch of potential contents.
 699         #  * We go thru looking for our first choice.
 700         #    (HTML or text, depending on self.html_mail)
 701         #  * If that doesn't work, we go thru looking for our second choice.
 702         #  * If that still doesn't work, we just take the first one.
 703         #
 704         # Possible future improvement:
 705         #  * Instead of just taking the first one
 706         #    pick the one in the "best" language.
 707         #  * HACK: hardcoded .html_mail, should take a tuple of media types
 708         contents = list(entry.get('content', []))
 709         if entry.get('summary_detail', None):
 710             contents.append(entry.summary_detail)
 711         if self.html_mail:
 712             types = ['text/html', 'text/plain']
 713         else:
 714             types = ['text/plain', 'text/html']
 715         for content_type in types:
 716             for content in contents:
 717                 if content['type'] == content_type:
 718                     return content
 719         if contents:
 720             return contents[0]
 721         return {'type': 'text/plain', 'value': ''}
 722
 723     def _process_entry_content(self, entry, content, subject):
 724         "Convert entry content to the requested format."
 725         link = self._get_entry_link(entry)
 726         if self.html_mail:
 727             lines = [
 728                 '<!DOCTYPE html>',
 729                 '<html>',
 730                 '  <head>',
 731                 ]
 732             if self.use_css and self.css:
 733                 lines.extend([
 734                         '    <style type="text/css">',
 735                         self.css,
 736                         '    </style>',
 737                         ])
 738             lines.extend([
 739                     '</head>',
 740                     '<body>',
 741                     '<div id="entry">',
 742                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
 743                         link, subject),
 744                     '<div id="body">',
 745                     ])
 746             if content['type'] in ('text/html', 'application/xhtml+xml'):
 747                 lines.append(content['value'].strip())
 748             else:
 749                 lines.append(_saxutils.escape(content['value'].strip()))
 750             lines.append('</div>')
 751             lines.extend([
 752                     '<div class="footer">'
 753                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
 754                     ])
 755             for enclosure in getattr(entry, 'enclosures', []):
 756                 if getattr(enclosure, 'url', None):
 757                     lines.append(
 758                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 759                             enclosure.url))
 760                 if getattr(enclosure, 'src', None):
 761                     lines.append(
 762                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 763                             enclosure.src))
 764                     lines.append(
 765                         '<p><img src="{}" /></p>'.format(enclosure.src))
 766             for elink in getattr(entry, 'links', []):
 767                 if elink.get('rel', None) == 'via':
 768                     url = elink['href']
 769                     title = elink.get('title', url)
 770                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
 771                             url, title))
 772             lines.extend([
 773                     '</div>',  # /footer
 774                     '</div>',  # /entry
 775                     '</body>',
 776                     '</html>',
 777                     ''])
 778             content['type'] = 'text/html'
 779             content['value'] = '\n'.join(lines)
 780             return content
 781         else:  # not self.html_mail
 782             if content['type'] in ('text/html', 'application/xhtml+xml'):
 783                 try:
 784                     lines = [self._html2text(content['value'])]
 785                 except _html_parser.HTMLParseError as e:
 786                     raise _error.ProcessingError(parsed=None, feed=self)
 787             else:
 788                 lines = [content['value']]
 789             lines.append('')
 790             lines.append('URL: {}'.format(link))
 791             for enclosure in getattr(entry, 'enclosures', []):
 792                 if getattr(enclosure, 'url', None):
 793                     lines.append('Enclosure: {}'.format(enclosure.url))
 794                 if getattr(enclosure, 'src', None):
 795                     lines.append('Enclosure: {}'.format(enclosure.src))
 796             for elink in getattr(entry, 'links', []):
 797                 if elink.get('rel', None) == 'via':
 798                     url = elink['href']
 799                     title = elink.get('title', url)
 800                     lines.append('Via: {} {}'.format(title, url))
 801             content['type'] = 'text/plain'
 802             content['value'] = '\n'.join(lines)
 803             return content
 804
 805     def _send(self, sender, message):
 806         _LOG.info('send message for {}'.format(self))
 807         section = self.section
 808         if section not in self.config:
 809             section = 'DEFAULT'
 810         _email.send(sender=sender, recipient=self.to, message=message,
 811                     config=self.config, section=section)
 812
 813     def run(self, send=True):
 814         """Fetch and process the feed, mailing entry emails.
 815
 816         >>> feed = Feed(
 817         ...    name='test-feed',
 818         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 819         >>> def send(sender, message):
 820         ...    print('send from {}:'.format(sender))
 821         ...    print(message.as_string())
 822         >>> feed._send = send
 823         >>> feed.to = 'jdoe@dummy.invalid'
 824         >>> #parsed = feed.run()  # enable for debugging
 825         """
 826         if not self.to:
 827             raise _error.NoToEmailAddress(feed=self)
 828         parsed = self._fetch()
 829
 830         if self.digest:
 831             digest = self._new_digest()
 832             seen = []
 833
 834         for (guid, id_, sender, message) in self._process(parsed):
 835             _LOG.debug('new message: {}'.format(message['Subject']))
 836             if self.digest:
 837                 seen.append((guid, id_))
 838                 self._append_to_digest(digest=digest, message=message)
 839             else:
 840                 if send:
 841                     self._send(sender=sender, message=message)
 842                 if guid not in self.seen:
 843                     self.seen[guid] = {}
 844                 self.seen[guid]['id'] = id_
 845
 846         if self.digest and seen:
 847             if self.digest_post_process:
 848                 digest = self.digest_post_process(
 849                     feed=self, parsed=parsed, seen=seen, message=digest)
 850                 if not digest:
 851                     return
 852             self._send_digest(
 853                 digest=digest, seen=seen, sender=sender, send=send)
 854
 855         self.etag = parsed.get('etag', None)
 856         self.modified = parsed.get('modified', None)
 857
 858     def _new_digest(self):
 859         digest = _MIMEMultipart('digest')
 860         digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
 861         digest['Subject'] = 'digest for {}'.format(self.name)
 862         digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
 863         digest['User-Agent'] = _USER_AGENT
 864         digest['X-RSS-Feed'] = self.url
 865         return digest
 866
 867     def _append_to_digest(self, digest, message):
 868         part = _MIMEMessage(message)
 869         part.add_header('Content-Disposition', 'attachment')
 870         digest.attach(part)
 871
 872     def _send_digest(self, digest, seen, sender, send=True):
 873         """Send a digest message
 874
 875         The date is extracted from the last message in the digest
 876         payload.  We assume that this part exists.  If you don't have
 877         any messages in the digest, don't call this function.
 878         """
 879         digest['From'] = sender  # TODO: _Header(), _formataddr()...
 880         last_part = digest.get_payload()[-1]
 881         last_message = last_part.get_payload()[0]
 882         digest['Date'] = last_message['Date']
 883
 884         _LOG.debug('new digest for {}'.format(self))
 885         if send:
 886             self._send(sender=sender, message=digest)
 887         for (guid, id_) in seen:
 888             if guid not in self.seen:
 889                 self.seen[guid] = {}
 890             self.seen[guid]['id'] = id_