rss2email/feed.py

   1 # -*- coding: utf-8 -*-
   2 # Copyright (C) 2004-2013 Aaron Swartz
   3 #                         Brian Lalor
   4 #                         Dean Jackson
   5 #                         Dennis Keitzel <github@pinshot.net>
   6 #                         Erik Hetzner
   7 #                         Etienne Millon <me@emillon.org>
   8 #                         J. Lewis Muir <jlmuir@imca-cat.org>
   9 #                         Joey Hess
  10 #                         Lindsey Smith <lindsey.smith@gmail.com>
  11 #                         Marcel Ackermann
  12 #                         Martin 'Joey' Schulze
  13 #                         Matej Cepl
  14 #                         W. Trevor King <wking@tremily.us>
  15 #
  16 # This file is part of rss2email.
  17 #
  18 # rss2email is free software: you can redistribute it and/or modify it under
  19 # the terms of the GNU General Public License as published by the Free Software
  20 # Foundation, either version 2 of the License, or (at your option) version 3 of
  21 # the License.
  22 #
  23 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
  24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  25 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  26 #
  27 # You should have received a copy of the GNU General Public License along with
  28 # rss2email.  If not, see <http://www.gnu.org/licenses/>.
  29
  30 """Define the ``Feed`` class for handling a single feed
  31 """
  32
  33 import collections as _collections
  34 from email.mime.message import MIMEMessage as _MIMEMessage
  35 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
  36 from email.utils import formataddr as _formataddr
  37 import hashlib as _hashlib
  38 import html.parser as _html_parser
  39 import re as _re
  40 import socket as _socket
  41 import time as _time
  42 import urllib.error as _urllib_error
  43 import urllib.request as _urllib_request
  44 import uuid as _uuid
  45 import xml.sax as _sax
  46 import xml.sax.saxutils as _saxutils
  47
  48 import feedparser as _feedparser
  49 import html2text as _html2text
  50
  51 from . import __url__
  52 from . import __version__
  53 from . import LOG as _LOG
  54 from . import config as _config
  55 from . import email as _email
  56 from . import error as _error
  57 from . import util as _util
  58
  59
  60 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
  61 _feedparser.USER_AGENT = _USER_AGENT
  62 _urllib_request.install_opener(_urllib_request.build_opener())
  63 _SOCKET_ERRORS = []
  64 for e in ['error', 'herror', 'gaierror']:
  65     if hasattr(_socket, e):
  66         _SOCKET_ERRORS.append(getattr(_socket, e))
  67 del e  # cleanup namespace
  68 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
  69
  70 # drv_libxml2 raises:
  71 #   TypeError: 'str' does not support the buffer interface
  72 _feedparser.PREFERRED_XML_PARSERS = []
  73
  74
  75 class Feed (object):
  76     """Utility class for feed manipulation and storage.
  77
  78     >>> import pickle
  79     >>> import sys
  80     >>> from .config import CONFIG
  81
  82     >>> feed = Feed(
  83     ...    name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
  84     >>> print(feed)
  85     test-feed (http://example.com/feed.atom -> a@b.com)
  86     >>> feed.section
  87     'feed.test-feed'
  88     >>> feed.from_email
  89     'user@rss2email.invalid'
  90
  91     >>> feed.from_email = 'a@b.com'
  92     >>> feed.save_to_config()
  93     >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
  94     [DEFAULT]
  95     from = user@rss2email.invalid
  96     ...
  97     verbose = warning
  98     <BLANKLINE>
  99     [feed.test-feed]
 100     url = http://example.com/feed.atom
 101     from = a@b.com
 102     to = a@b.com
 103     <BLANKLINE>
 104
 105     >>> feed.etag = 'dummy etag'
 106     >>> string = pickle.dumps(feed)
 107     >>> feed = pickle.loads(string)
 108     >>> feed.load_from_config(config=CONFIG)
 109     >>> feed.etag
 110     'dummy etag'
 111     >>> feed.url
 112     'http://example.com/feed.atom'
 113
 114     Names can only contain letters, digits, and '._-'.  Here the
 115     invalid space causes an exception:
 116
 117     >>> Feed(name='invalid name')
 118     Traceback (most recent call last):
 119       ...
 120     rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
 121
 122     However, you aren't restricted to ASCII letters:
 123
 124     >>> Feed(name='Αθήνα')
 125     <Feed Αθήνα (None -> )>
 126
 127     You must define a URL:
 128
 129     >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
 130     Traceback (most recent call last):
 131       ...
 132     rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
 133
 134
 135     Cleanup `CONFIG`.
 136
 137     >>> CONFIG['DEFAULT']['to'] = ''
 138     >>> test_section = CONFIG.pop('feed.test-feed')
 139
 140     """
 141     _name_regexp = _re.compile('^[\w\d.-]+$')
 142
 143     # saved/loaded from feed.dat using __getstate__/__setstate__.
 144     _dynamic_attributes = [
 145         'name',
 146         'etag',
 147         'modified',
 148         'seen',
 149         ]
 150
 151     ## saved/loaded from ConfigParser instance
 152     # attributes that aren't in DEFAULT
 153     _non_default_configured_attributes = [
 154         'url',
 155         ]
 156     # attributes that are in DEFAULT
 157     _default_configured_attributes = [
 158         key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
 159     _default_configured_attributes[
 160         _default_configured_attributes.index('from')
 161         ] = 'from_email'  # `from` is a Python keyword
 162     # all attributes that are saved/loaded from .config
 163     _configured_attributes = (
 164         _non_default_configured_attributes + _default_configured_attributes)
 165     # attribute name -> .config option
 166     _configured_attribute_translations = dict(
 167         (attr,attr) for attr in _non_default_configured_attributes)
 168     _configured_attribute_translations.update(dict(
 169             zip(_default_configured_attributes,
 170                 _config.CONFIG['DEFAULT'].keys())))
 171     # .config option -> attribute name
 172     _configured_attribute_inverse_translations = dict(
 173         (v,k) for k,v in _configured_attribute_translations.items())
 174
 175     # hints for value conversion
 176     _boolean_attributes = [
 177         'digest',
 178         'force_from',
 179         'use_publisher_email',
 180         'active',
 181         'date_header',
 182         'trust_guid',
 183         'trust_link',
 184         'html_mail',
 185         'use_css',
 186         'unicode_snob',
 187         'links_after_each_paragraph',
 188         'use_smtp',
 189         'smtp_ssl',
 190         ]
 191
 192     _integer_attributes = [
 193         'feed_timeout',
 194         'body_width',
 195         ]
 196
 197     _list_attributes = [
 198         'date_header_order',
 199         'encodings',
 200         ]
 201
 202     _function_attributes = [
 203         'post_process',
 204         'digest_post_process',
 205         ]
 206
 207     def __init__(self, name=None, url=None, to=None, config=None):
 208         self._set_name(name=name)
 209         self.reset()
 210         self.__setstate__(dict(
 211                 (attr, getattr(self, attr))
 212                 for attr in self._dynamic_attributes))
 213         self.load_from_config(config=config)
 214         if url:
 215             self.url = url
 216         if to:
 217             self.to = to
 218
 219     def __str__(self):
 220         return '{} ({} -> {})'.format(self.name, self.url, self.to)
 221
 222     def __repr__(self):
 223         return '<Feed {}>'.format(str(self))
 224
 225     def __getstate__(self):
 226         "Save dyamic attributes"
 227         return dict(
 228             (key,getattr(self,key)) for key in self._dynamic_attributes)
 229
 230     get_state = __getstate__  # make it publicly accessible
 231
 232     def __setstate__(self, state):
 233         "Restore dynamic attributes"
 234         keys = sorted(state.keys())
 235         if keys != sorted(self._dynamic_attributes):
 236             raise ValueError(state)
 237         self._set_name(name=state['name'])
 238         self.__dict__.update(state)
 239
 240     set_state = __setstate__  # make it publicly accessible
 241
 242     def save_to_config(self):
 243         "Save configured attributes"
 244         data = _collections.OrderedDict()
 245         default = self.config['DEFAULT']
 246         for attr in self._configured_attributes:
 247             key = self._configured_attribute_translations[attr]
 248             value = getattr(self, attr)
 249             if value is not None:
 250                 value = self._get_configured_option_value(
 251                     attribute=attr, value=value)
 252                 if (attr in self._non_default_configured_attributes or
 253                     value != default[key]):
 254                     data[key] = value
 255         self.config[self.section] = data
 256
 257     def load_from_config(self, config=None):
 258         "Restore configured attributes"
 259         if config is None:
 260             config = _config.CONFIG
 261         self.config = config
 262         if self.section in self.config:
 263             data = self.config[self.section]
 264         else:
 265             data = self.config['DEFAULT']
 266         keys = sorted(data.keys())
 267         expected = sorted(self._configured_attribute_translations.values())
 268         if keys != expected:
 269             for key in expected:
 270                 if (key not in keys and
 271                     key not in self._non_default_configured_attributes):
 272                     raise _error.InvalidFeedConfig(
 273                         setting=key, feed=self,
 274                         message='missing configuration key: {}'.format(key))
 275             for key in keys:
 276                 if key not in expected:
 277                     raise _error.InvalidFeedConfig(
 278                         setting=key, feed=self,
 279                         message='extra configuration key: {}'.format(key))
 280         data = dict(
 281             (self._configured_attribute_inverse_translations[k],
 282              self._get_configured_attribute_value(
 283                   attribute=self._configured_attribute_inverse_translations[k],
 284                   key=k, data=data))
 285             for k in data.keys())
 286         for attr in self._non_default_configured_attributes:
 287             if attr not in data:
 288                 data[attr] = None
 289         self.__dict__.update(data)
 290
 291     def _get_configured_option_value(self, attribute, value):
 292         if value is None:
 293             return ''
 294         elif attribute in self._list_attributes:
 295             return ', '.join(value)
 296         elif attribute in self._function_attributes:
 297             return _util.import_name(value)
 298         return str(value)
 299
 300     def _get_configured_attribute_value(self, attribute, key, data):
 301         if attribute in self._boolean_attributes:
 302             return data.getboolean(key)
 303         elif attribute in self._integer_attributes:
 304             return data.getint(key)
 305         elif attribute in self._list_attributes:
 306             return [x.strip() for x in data[key].split(',')]
 307         elif attribute in self._function_attributes:
 308             if data[key]:
 309                 return _util.import_function(data[key])
 310             return None
 311         return data[key]
 312
 313     def reset(self):
 314         """Reset dynamic data
 315         """
 316         self.etag = None
 317         self.modified = None
 318         self.seen = {}
 319
 320     def _set_name(self, name):
 321         if not self._name_regexp.match(name):
 322             raise _error.InvalidFeedName(name=name, feed=self)
 323         self.name = name
 324         self.section = 'feed.{}'.format(self.name)
 325
 326     def _fetch(self):
 327         """Fetch and parse a feed using feedparser.
 328
 329         >>> feed = Feed(
 330         ...    name='test-feed',
 331         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 332         >>> parsed = feed._fetch()
 333         >>> parsed.status
 334         200
 335         """
 336         _LOG.info('fetch {}'.format(self))
 337         if not self.url:
 338             raise _error.InvalidFeedConfig(setting='url', feed=self)
 339         if self.section in self.config:
 340             config = self.config[self.section]
 341         else:
 342             config = self.config['DEFAULT']
 343         proxy = config['proxy']
 344         timeout = config.getint('feed-timeout')
 345         kwargs = {}
 346         if proxy:
 347             kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
 348         f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
 349         return f(self.url, self.etag, modified=self.modified, **kwargs)
 350
 351     def _process(self, parsed):
 352         _LOG.info('process {}'.format(self))
 353         self._check_for_errors(parsed)
 354         for entry in reversed(parsed.entries):
 355             _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
 356             processed = self._process_entry(parsed=parsed, entry=entry)
 357             if processed:
 358                 guid,id_,sender,message = processed
 359                 if self.post_process:
 360                     message = self.post_process(
 361                         feed=self, parsed=parsed, entry=entry, guid=guid,
 362                         message=message)
 363                     if not message:
 364                         continue
 365                 yield (guid, id_, sender, message)
 366
 367     def _check_for_errors(self, parsed):
 368         warned = False
 369         status = getattr(parsed, 'status', 200)
 370         _LOG.debug('HTTP status {}'.format(status))
 371         if status == 301:
 372             _LOG.info('redirect {} from {} to {}'.format(
 373                     self.name, self.url, parsed['url']))
 374             self.url = parsed['url']
 375         elif status not in [200, 302, 304]:
 376             raise _error.HTTPError(status=status, feed=self)
 377
 378         http_headers = parsed.get('headers', {})
 379         if http_headers:
 380             _LOG.debug('HTTP headers: {}'.format(http_headers))
 381         if not http_headers:
 382             _LOG.warning('could not get HTTP headers: {}'.format(self))
 383             warned = True
 384         else:
 385             if 'html' in http_headers.get('content-type', 'rss'):
 386                 _LOG.warning('looks like HTML: {}'.format(self))
 387                 warned = True
 388             if http_headers.get('content-length', '1') == '0':
 389                 _LOG.warning('empty page: {}'.format(self))
 390                 warned = True
 391
 392         version = parsed.get('version', None)
 393         if version:
 394             _LOG.debug('feed version {}'.format(version))
 395         else:
 396             _LOG.warning('unrecognized version: {}'.format(self))
 397             warned = True
 398
 399         exc = parsed.get('bozo_exception', None)
 400         if isinstance(exc, _socket.timeout):
 401             _LOG.error('timed out: {}'.format(self))
 402             warned = True
 403         elif isinstance(exc, OSError):
 404             _LOG.error('{}: {}'.format(exc, self))
 405             warned = True
 406         elif isinstance(exc, _SOCKET_ERRORS):
 407             _LOG.error('{}: {}'.format(exc, self))
 408             warned = True
 409         elif isinstance(exc, _feedparser.zlib.error):
 410             _LOG.error('broken compression: {}'.format(self))
 411             warned = True
 412         elif isinstance(exc, (IOError, AttributeError)):
 413             _LOG.error('{}: {}'.format(exc, self))
 414             warned = True
 415         elif isinstance(exc, KeyboardInterrupt):
 416             raise exc
 417         elif isinstance(exc, _sax.SAXParseException):
 418             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
 419             warned = True
 420         elif (parsed.bozo and
 421               isinstance(exc, _feedparser.CharacterEncodingOverride)):
 422             _LOG.warning(
 423                 'incorrectly declared encoding: {}: {}'.format(exc, self))
 424             warned = True
 425         elif parsed.bozo or exc:
 426             if exc is None:
 427                 exc = "can't process"
 428             _LOG.error('processing error: {}: {}'.format(exc, self))
 429             warned = True
 430
 431         if (not warned and
 432             status in [200, 302] and
 433             not parsed.entries and
 434             not version):
 435             raise _error.ProcessingError(parsed=parsed, feed=feed)
 436
 437     def _html2text(self, html, baseurl='', default=None):
 438         self.config.setup_html2text(section=self.section)
 439         try:
 440             return _html2text.html2text(html=html, baseurl=baseurl)
 441         except _html_parser.HTMLParseError as e:
 442             if default is not None:
 443                 return default
 444             raise
 445
 446     def _process_entry(self, parsed, entry):
 447         id_ = self._get_entry_id(entry)
 448         # If .trust_guid isn't set, we get back hashes of the content.
 449         # Instead of letting these run wild, we put them in context
 450         # by associating them with the actual ID (if it exists).
 451         guid = entry.get('id', id_)
 452         if isinstance(guid, dict):
 453             guid = guid.values()[0]
 454         if guid in self.seen:
 455             if self.seen[guid]['id'] == id_:
 456                 _LOG.debug('already seen {}'.format(id_))
 457                 return  # already seen
 458         sender = self._get_entry_email(parsed=parsed, entry=entry)
 459         subject = self._get_entry_title(entry)
 460         extra_headers = _collections.OrderedDict((
 461                 ('Date', self._get_entry_date(entry)),
 462                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
 463                 ('User-Agent', _USER_AGENT),
 464                 ('X-RSS-Feed', self.url),
 465                 ('X-RSS-ID', id_),
 466                 ('X-RSS-URL', self._get_entry_link(entry)),
 467                 ('X-RSS-TAGS', self._get_entry_tags(entry)),
 468                 ))
 469         for k,v in extra_headers.items():  # remove empty tags, etc.
 470             if v is None:
 471                 extra_headers.pop(k)
 472         if self.bonus_header:
 473             for header in self.bonus_header.splitlines():
 474                 if ':' in header:
 475                     key,value = header.split(':', 1)
 476                     extra_headers[key.strip()] = value.strip()
 477                 else:
 478                     _LOG.warning(
 479                         'malformed bonus-header: {}'.format(
 480                             self.bonus_header))
 481
 482         content = self._get_entry_content(entry)
 483         try:
 484             content = self._process_entry_content(
 485                 entry=entry, content=content, subject=subject)
 486         except _error.ProcessingError as e:
 487             e.parsed = parsed
 488             raise
 489         message = _email.get_message(
 490             sender=sender,
 491             recipient=self.to,
 492             subject=subject,
 493             body=content['value'],
 494             content_type=content['type'].split('/', 1)[1],
 495             extra_headers=extra_headers,
 496             config=self.config,
 497             section=self.section)
 498         return (guid, id_, sender, message)
 499
 500     def _get_entry_id(self, entry):
 501         """Get best ID from an entry."""
 502         if self.trust_link:
 503             return entry.get('link', None)
 504         if self.trust_guid:
 505             if getattr(entry, 'id', None):
 506                 # Newer versions of feedparser could return a dictionary
 507                 if isinstance(entry.id, dict):
 508                     return entry.id.values()[0]
 509                 return entry.id
 510         content = self._get_entry_content(entry)
 511         content_value = content['value'].strip()
 512         if content_value:
 513             return _hashlib.sha1(
 514                 content_value.encode('unicode-escape')).hexdigest()
 515         elif getattr(entry, 'link', None):
 516             return _hashlib.sha1(
 517                 entry.link.encode('unicode-escape')).hexdigest()
 518         elif getattr(entry, 'title', None):
 519             return _hashlib.sha1(
 520                 entry.title.encode('unicode-escape')).hexdigest()
 521
 522     def _get_entry_link(self, entry):
 523         return entry.get('link', None)
 524
 525     def _get_entry_title(self, entry):
 526         if hasattr(entry, 'title_detail') and entry.title_detail:
 527             title = entry.title_detail.value
 528             if 'html' in entry.title_detail.type:
 529                 title = self._html2text(title, default=title)
 530         else:
 531             content = self._get_entry_content(entry)
 532             value = content['value']
 533             if content['type'] in ('text/html', 'application/xhtml+xml'):
 534                 value = self._html2text(value, default=value)
 535             title = value[:70]
 536         title = title.replace('\n', ' ').strip()
 537         return title
 538
 539     def _get_entry_date(self, entry):
 540         datetime = _time.gmtime()
 541         if self.date_header:
 542             for datetype in self.date_header_order:
 543                 kind = datetype + '_parsed'
 544                 if entry.get(kind, None):
 545                     datetime = entry[kind]
 546                     break
 547         return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 548
 549     def _get_entry_name(self, parsed, entry):
 550         """Get the best name
 551
 552         >>> import feedparser
 553         >>> f = Feed(name='test-feed')
 554         >>> parsed = feedparser.parse(
 555         ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
 556         ...     '  <entry>\\n'
 557         ...     '    <author>\\n'
 558         ...     '      <name>Example author</name>\\n'
 559         ...     '      <email>me@example.com</email>\\n'
 560         ...     '      <url>http://example.com/</url>\\n'
 561         ...     '    </author>\\n'
 562         ...     '  </entry>\\n'
 563         ...     '</feed>\\n'
 564         ...     )
 565         >>> entry = parsed.entries[0]
 566         >>> f.name_format = ''
 567         >>> f._get_entry_name(parsed, entry)
 568         ''
 569         >>> f.name_format = '{author}'
 570         >>> f._get_entry_name(parsed, entry)
 571         'Example author'
 572         >>> f.name_format = '{feed-title}: {author}'
 573         >>> f._get_entry_name(parsed, entry)
 574         ': Example author'
 575         >>> f.name_format = '{author} ({feed.name})'
 576         >>> f._get_entry_name(parsed, entry)
 577         'Example author (test-feed)'
 578         """
 579         if not self.name_format:
 580             return ''
 581         data = {
 582             'feed': self,
 583             'feed-title': '<feed title>',
 584             'author': '<author>',
 585             'publisher': '<publisher>',
 586             }
 587         feed = parsed.feed
 588         data['feed-title'] = feed.get('title', '')
 589         for x in [entry, feed]:
 590             if 'name' in x.get('author_detail', []):
 591                 if x.author_detail.name:
 592                     data['author'] = x.author_detail.name
 593                     break
 594         if 'name' in feed.get('publisher_detail', []):
 595             data['publisher'] = feed.publisher_detail.name
 596         name = self.name_format.format(**data)
 597         return _html2text.unescape(name)
 598
 599     def _validate_email(self, email, default=None):
 600         """Do a basic quality check on email address
 601
 602         Return `default` if the address doesn't appear to be
 603         well-formed.  If `default` is `None`, return
 604         `self.from_email`.
 605
 606         >>> f = Feed(name='test-feed')
 607         >>> f._validate_email('valid@example.com', 'default@example.com')
 608         'valid@example.com'
 609         >>> f._validate_email('invalid@', 'default@example.com')
 610         'default@example.com'
 611         >>> f._validate_email('@invalid', 'default@example.com')
 612         'default@example.com'
 613         >>> f._validate_email('invalid', 'default@example.com')
 614         'default@example.com'
 615         """
 616         parts = email.split('@')
 617         if len(parts) != 2 or '' in parts:
 618             if default is None:
 619                 return self.from_email
 620             return default
 621         return email
 622
 623     def _get_entry_address(self, parsed, entry):
 624         """Get the best From email address ('<jdoe@a.com>')
 625
 626         If the best guess isn't well-formed (something@somthing.com),
 627         use `self.from_email` instead.
 628         """
 629         if self.force_from:
 630             return self.from_email
 631         feed = parsed.feed
 632         if 'email' in entry.get('author_detail', []):
 633             return self._validate_email(entry.author_detail.email)
 634         elif 'email' in feed.get('author_detail', []):
 635             return self._validate_email(feed.author_detail.email)
 636         if self.use_publisher_email:
 637             if 'email' in feed.get('publisher_detail', []):
 638                 return self._validate_email(feed.publisher_detail.email)
 639             if feed.get('errorreportsto', None):
 640                 return self._validate_email(feed.errorreportsto)
 641         _LOG.debug('no sender address found, fallback to default')
 642         return self.from_email
 643
 644     def _get_entry_email(self, parsed, entry):
 645         """Get the best From email address ('John <jdoe@a.com>')
 646         """
 647         name = self._get_entry_name(parsed=parsed, entry=entry)
 648         address = self._get_entry_address(parsed=parsed, entry=entry)
 649         return _formataddr((name, address))
 650
 651     def _get_entry_tags(self, entry):
 652         """Add post tags, if available
 653
 654         >>> f = Feed(name='test-feed')
 655         >>> f._get_entry_tags({
 656         ...         'tags': [{'term': 'tag1',
 657         ...                   'scheme': None,
 658         ...                   'label': None}]})
 659         'tag1'
 660         >>> f._get_entry_tags({
 661         ...         'tags': [{'term': 'tag1',
 662         ...                   'scheme': None,
 663         ...                   'label': None},
 664         ...                  {'term': 'tag2',
 665         ...                   'scheme': None,
 666         ...                   'label': None}]})
 667         'tag1,tag2'
 668
 669         Test some troublesome cases.  No tags:
 670
 671         >>> f._get_entry_tags({})
 672
 673         Empty tags:
 674
 675         >>> f._get_entry_tags({'tags': []})
 676
 677         Tags without a ``term`` entry:
 678
 679         >>> f._get_entry_tags({
 680         ...         'tags': [{'scheme': None,
 681         ...                   'label': None}]})
 682
 683         Tags with an empty term:
 684
 685         >>> f._get_entry_tags({
 686         ...         'tags': [{'term': '',
 687         ...                   'scheme': None,
 688         ...                   'label': None}]})
 689         """
 690         taglist = [tag['term'] for tag in entry.get('tags', [])
 691                    if tag.get('term', '')]
 692         if taglist:
 693             return ','.join(taglist)
 694
 695     def _get_entry_content(self, entry):
 696         """Select the best content from an entry.
 697
 698         Returns a feedparser content dict.
 699         """
 700         # How this works:
 701         #  * We have a bunch of potential contents.
 702         #  * We go thru looking for our first choice.
 703         #    (HTML or text, depending on self.html_mail)
 704         #  * If that doesn't work, we go thru looking for our second choice.
 705         #  * If that still doesn't work, we just take the first one.
 706         #
 707         # Possible future improvement:
 708         #  * Instead of just taking the first one
 709         #    pick the one in the "best" language.
 710         #  * HACK: hardcoded .html_mail, should take a tuple of media types
 711         contents = list(entry.get('content', []))
 712         if entry.get('summary_detail', None):
 713             contents.append(entry.summary_detail)
 714         if self.html_mail:
 715             types = ['text/html', 'text/plain']
 716         else:
 717             types = ['text/plain', 'text/html']
 718         for content_type in types:
 719             for content in contents:
 720                 if content['type'] == content_type:
 721                     return content
 722         if contents:
 723             return contents[0]
 724         return {'type': 'text/plain', 'value': ''}
 725
 726     def _process_entry_content(self, entry, content, subject):
 727         "Convert entry content to the requested format."
 728         link = self._get_entry_link(entry)
 729         if self.html_mail:
 730             lines = [
 731                 '<!DOCTYPE html>',
 732                 '<html>',
 733                 '  <head>',
 734                 ]
 735             if self.use_css and self.css:
 736                 lines.extend([
 737                         '    <style type="text/css">',
 738                         self.css,
 739                         '    </style>',
 740                         ])
 741             lines.extend([
 742                     '</head>',
 743                     '<body>',
 744                     '<div id="entry">',
 745                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
 746                         link, subject),
 747                     '<div id="body">',
 748                     ])
 749             if content['type'] in ('text/html', 'application/xhtml+xml'):
 750                 lines.append(content['value'].strip())
 751             else:
 752                 lines.append(_saxutils.escape(content['value'].strip()))
 753             lines.append('</div>')
 754             lines.extend([
 755                     '<div class="footer">'
 756                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
 757                     ])
 758             for enclosure in getattr(entry, 'enclosures', []):
 759                 if getattr(enclosure, 'url', None):
 760                     lines.append(
 761                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 762                             enclosure.url))
 763                 if getattr(enclosure, 'src', None):
 764                     lines.append(
 765                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 766                             enclosure.src))
 767                     lines.append(
 768                         '<p><img src="{}" /></p>'.format(enclosure.src))
 769             for elink in getattr(entry, 'links', []):
 770                 if elink.get('rel', None) == 'via':
 771                     url = elink['href']
 772                     title = elink.get('title', url)
 773                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
 774                             url, title))
 775             lines.extend([
 776                     '</div>',  # /footer
 777                     '</div>',  # /entry
 778                     '</body>',
 779                     '</html>',
 780                     ''])
 781             content['type'] = 'text/html'
 782             content['value'] = '\n'.join(lines)
 783             return content
 784         else:  # not self.html_mail
 785             if content['type'] in ('text/html', 'application/xhtml+xml'):
 786                 try:
 787                     lines = [self._html2text(content['value'])]
 788                 except _html_parser.HTMLParseError as e:
 789                     raise _error.ProcessingError(parsed=None, feed=self)
 790             else:
 791                 lines = [content['value']]
 792             lines.append('')
 793             lines.append('URL: {}'.format(link))
 794             for enclosure in getattr(entry, 'enclosures', []):
 795                 if getattr(enclosure, 'url', None):
 796                     lines.append('Enclosure: {}'.format(enclosure.url))
 797                 if getattr(enclosure, 'src', None):
 798                     lines.append('Enclosure: {}'.format(enclosure.src))
 799             for elink in getattr(entry, 'links', []):
 800                 if elink.get('rel', None) == 'via':
 801                     url = elink['href']
 802                     title = elink.get('title', url)
 803                     lines.append('Via: {} {}'.format(title, url))
 804             content['type'] = 'text/plain'
 805             content['value'] = '\n'.join(lines)
 806             return content
 807
 808     def _send(self, sender, message):
 809         _LOG.info('send message for {}'.format(self))
 810         section = self.section
 811         if section not in self.config:
 812             section = 'DEFAULT'
 813         _email.send(sender=sender, recipient=self.to, message=message,
 814                     config=self.config, section=section)
 815
 816     def run(self, send=True):
 817         """Fetch and process the feed, mailing entry emails.
 818
 819         >>> feed = Feed(
 820         ...    name='test-feed',
 821         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 822         >>> def send(sender, message):
 823         ...    print('send from {}:'.format(sender))
 824         ...    print(message.as_string())
 825         >>> feed._send = send
 826         >>> feed.to = 'jdoe@dummy.invalid'
 827         >>> #parsed = feed.run()  # enable for debugging
 828         """
 829         if not self.to:
 830             raise _error.NoToEmailAddress(feed=self)
 831         parsed = self._fetch()
 832
 833         if self.digest:
 834             digest = self._new_digest()
 835             seen = []
 836
 837         for (guid, id_, sender, message) in self._process(parsed):
 838             _LOG.debug('new message: {}'.format(message['Subject']))
 839             if self.digest:
 840                 seen.append((guid, id_))
 841                 self._append_to_digest(digest=digest, message=message)
 842             else:
 843                 if send:
 844                     self._send(sender=sender, message=message)
 845                 if guid not in self.seen:
 846                     self.seen[guid] = {}
 847                 self.seen[guid]['id'] = id_
 848
 849         if self.digest and seen:
 850             if self.digest_post_process:
 851                 digest = self.digest_post_process(
 852                     feed=self, parsed=parsed, seen=seen, message=digest)
 853                 if not digest:
 854                     return
 855             self._send_digest(
 856                 digest=digest, seen=seen, sender=sender, send=send)
 857
 858         self.etag = parsed.get('etag', None)
 859         self.modified = parsed.get('modified', None)
 860
 861     def _new_digest(self):
 862         digest = _MIMEMultipart('digest')
 863         digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
 864         digest['Subject'] = 'digest for {}'.format(self.name)
 865         digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
 866         digest['User-Agent'] = _USER_AGENT
 867         digest['X-RSS-Feed'] = self.url
 868         return digest
 869
 870     def _append_to_digest(self, digest, message):
 871         part = _MIMEMessage(message)
 872         part.add_header('Content-Disposition', 'attachment')
 873         digest.attach(part)
 874
 875     def _send_digest(self, digest, seen, sender, send=True):
 876         """Send a digest message
 877
 878         The date is extracted from the last message in the digest
 879         payload.  We assume that this part exists.  If you don't have
 880         any messages in the digest, don't call this function.
 881         """
 882         digest['From'] = sender  # TODO: _Header(), _formataddr()...
 883         last_part = digest.get_payload()[-1]
 884         last_message = last_part.get_payload()[0]
 885         digest['Date'] = last_message['Date']
 886
 887         _LOG.debug('new digest for {}'.format(self))
 888         if send:
 889             self._send(sender=sender, message=digest)
 890         for (guid, id_) in seen:
 891             if guid not in self.seen:
 892                 self.seen[guid] = {}
 893             self.seen[guid]['id'] = id_