rss2email/feed.py

   1 # Copyright (C) 2004-2013 Aaron Swartz
   2 #                         Brian Lalor
   3 #                         Dean Jackson
   4 #                         Dennis Keitzel <github@pinshot.net>
   5 #                         Erik Hetzner
   6 #                         Etienne Millon <me@emillon.org>
   7 #                         J. Lewis Muir <jlmuir@imca-cat.org>
   8 #                         Joey Hess
   9 #                         Lindsey Smith <lindsey.smith@gmail.com>
  10 #                         Marcel Ackermann
  11 #                         Martin 'Joey' Schulze
  12 #                         Matej Cepl
  13 #                         W. Trevor King <wking@tremily.us>
  14 #
  15 # This file is part of rss2email.
  16 #
  17 # rss2email is free software: you can redistribute it and/or modify it under
  18 # the terms of the GNU General Public License as published by the Free Software
  19 # Foundation, either version 2 of the License, or (at your option) version 3 of
  20 # the License.
  21 #
  22 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
  23 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  24 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  25 #
  26 # You should have received a copy of the GNU General Public License along with
  27 # rss2email.  If not, see <http://www.gnu.org/licenses/>.
  28
  29 """Define the ``Feed`` class for handling a single feed
  30 """
  31
  32 import collections as _collections
  33 from email.mime.message import MIMEMessage as _MIMEMessage
  34 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
  35 from email.utils import formataddr as _formataddr
  36 import hashlib as _hashlib
  37 import html.parser as _html_parser
  38 import re as _re
  39 import socket as _socket
  40 import time as _time
  41 import urllib.error as _urllib_error
  42 import urllib.request as _urllib_request
  43 import uuid as _uuid
  44 import xml.sax as _sax
  45 import xml.sax.saxutils as _saxutils
  46
  47 import feedparser as _feedparser
  48 import html2text as _html2text
  49
  50 from . import __url__
  51 from . import __version__
  52 from . import LOG as _LOG
  53 from . import config as _config
  54 from . import email as _email
  55 from . import error as _error
  56 from . import util as _util
  57
  58
  59 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
  60 _feedparser.USER_AGENT = _USER_AGENT
  61 _urllib_request.install_opener(_urllib_request.build_opener())
  62 _SOCKET_ERRORS = []
  63 for e in ['error', 'herror', 'gaierror']:
  64     if hasattr(_socket, e):
  65         _SOCKET_ERRORS.append(getattr(_socket, e))
  66 del e  # cleanup namespace
  67 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
  68
  69 # drv_libxml2 raises:
  70 #   TypeError: 'str' does not support the buffer interface
  71 _feedparser.PREFERRED_XML_PARSERS = []
  72
  73
  74 class Feed (object):
  75     """Utility class for feed manipulation and storage.
  76
  77     >>> import pickle
  78     >>> import sys
  79     >>> from .config import CONFIG
  80
  81     >>> feed = Feed(
  82     ...    name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
  83     >>> print(feed)
  84     test-feed (http://example.com/feed.atom -> a@b.com)
  85     >>> feed.section
  86     'feed.test-feed'
  87     >>> feed.from_email
  88     'user@rss2email.invalid'
  89
  90     >>> feed.from_email = 'a@b.com'
  91     >>> feed.save_to_config()
  92     >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
  93     [DEFAULT]
  94     from = user@rss2email.invalid
  95     ...
  96     verbose = warning
  97     <BLANKLINE>
  98     [feed.test-feed]
  99     url = http://example.com/feed.atom
 100     from = a@b.com
 101     to = a@b.com
 102     <BLANKLINE>
 103
 104     >>> feed.etag = 'dummy etag'
 105     >>> string = pickle.dumps(feed)
 106     >>> feed = pickle.loads(string)
 107     >>> feed.load_from_config(config=CONFIG)
 108     >>> feed.etag
 109     'dummy etag'
 110     >>> feed.url
 111     'http://example.com/feed.atom'
 112
 113     Names can only contain ASCII letters, digits, and '._-'.  Here the
 114     invalid space causes an exception:
 115
 116     >>> Feed(name='invalid name')
 117     Traceback (most recent call last):
 118       ...
 119     rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
 120
 121     You must define a URL:
 122
 123     >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
 124     Traceback (most recent call last):
 125       ...
 126     rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
 127
 128
 129     Cleanup `CONFIG`.
 130
 131     >>> CONFIG['DEFAULT']['to'] = ''
 132     >>> test_section = CONFIG.pop('feed.test-feed')
 133     """
 134     _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
 135
 136     # saved/loaded from feed.dat using __getstate__/__setstate__.
 137     _dynamic_attributes = [
 138         'name',
 139         'etag',
 140         'modified',
 141         'seen',
 142         ]
 143
 144     ## saved/loaded from ConfigParser instance
 145     # attributes that aren't in DEFAULT
 146     _non_default_configured_attributes = [
 147         'url',
 148         ]
 149     # attributes that are in DEFAULT
 150     _default_configured_attributes = [
 151         key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
 152     _default_configured_attributes[
 153         _default_configured_attributes.index('from')
 154         ] = 'from_email'  # `from` is a Python keyword
 155     # all attributes that are saved/loaded from .config
 156     _configured_attributes = (
 157         _non_default_configured_attributes + _default_configured_attributes)
 158     # attribute name -> .config option
 159     _configured_attribute_translations = dict(
 160         (attr,attr) for attr in _non_default_configured_attributes)
 161     _configured_attribute_translations.update(dict(
 162             zip(_default_configured_attributes,
 163                 _config.CONFIG['DEFAULT'].keys())))
 164     # .config option -> attribute name
 165     _configured_attribute_inverse_translations = dict(
 166         (v,k) for k,v in _configured_attribute_translations.items())
 167
 168     # hints for value conversion
 169     _boolean_attributes = [
 170         'digest',
 171         'force_from',
 172         'use_publisher_email',
 173         'active',
 174         'date_header',
 175         'trust_guid',
 176         'html_mail',
 177         'use_css',
 178         'unicode_snob',
 179         'links_after_each_paragraph',
 180         'use_smtp',
 181         'smtp_ssl',
 182         ]
 183
 184     _integer_attributes = [
 185         'feed_timeout',
 186         'body_width',
 187         ]
 188
 189     _list_attributes = [
 190         'date_header_order',
 191         'encodings',
 192         ]
 193
 194     _function_attributes = [
 195         'post_process',
 196         'digest_post_process',
 197         ]
 198
 199     def __init__(self, name=None, url=None, to=None, config=None):
 200         self._set_name(name=name)
 201         self.reset()
 202         self.__setstate__(dict(
 203                 (attr, getattr(self, attr))
 204                 for attr in self._dynamic_attributes))
 205         self.load_from_config(config=config)
 206         if url:
 207             self.url = url
 208         if to:
 209             self.to = to
 210
 211     def __str__(self):
 212         return '{} ({} -> {})'.format(self.name, self.url, self.to)
 213
 214     def __repr__(self):
 215         return '<Feed {}>'.format(str(self))
 216
 217     def __getstate__(self):
 218         "Save dyamic attributes"
 219         return dict(
 220             (key,getattr(self,key)) for key in self._dynamic_attributes)
 221
 222     get_state = __getstate__  # make it publicly accessible
 223
 224     def __setstate__(self, state):
 225         "Restore dynamic attributes"
 226         keys = sorted(state.keys())
 227         if keys != sorted(self._dynamic_attributes):
 228             raise ValueError(state)
 229         self._set_name(name=state['name'])
 230         self.__dict__.update(state)
 231
 232     set_state = __setstate__  # make it publicly accessible
 233
 234     def save_to_config(self):
 235         "Save configured attributes"
 236         data = _collections.OrderedDict()
 237         default = self.config['DEFAULT']
 238         for attr in self._configured_attributes:
 239             key = self._configured_attribute_translations[attr]
 240             value = getattr(self, attr)
 241             if value is not None:
 242                 value = self._get_configured_option_value(
 243                     attribute=attr, value=value)
 244                 if (attr in self._non_default_configured_attributes or
 245                     value != default[key]):
 246                     data[key] = value
 247         self.config[self.section] = data
 248
 249     def load_from_config(self, config=None):
 250         "Restore configured attributes"
 251         if config is None:
 252             config = _config.CONFIG
 253         self.config = config
 254         if self.section in self.config:
 255             data = self.config[self.section]
 256         else:
 257             data = self.config['DEFAULT']
 258         keys = sorted(data.keys())
 259         expected = sorted(self._configured_attribute_translations.values())
 260         if keys != expected:
 261             for key in expected:
 262                 if (key not in keys and
 263                     key not in self._non_default_configured_attributes):
 264                     raise _error.InvalidFeedConfig(
 265                         setting=key, feed=self,
 266                         message='missing configuration key: {}'.format(key))
 267             for key in keys:
 268                 if key not in expected:
 269                     raise _error.InvalidFeedConfig(
 270                         setting=key, feed=self,
 271                         message='extra configuration key: {}'.format(key))
 272         data = dict(
 273             (self._configured_attribute_inverse_translations[k],
 274              self._get_configured_attribute_value(
 275                   attribute=self._configured_attribute_inverse_translations[k],
 276                   key=k, data=data))
 277             for k in data.keys())
 278         for attr in self._non_default_configured_attributes:
 279             if attr not in data:
 280                 data[attr] = None
 281         self.__dict__.update(data)
 282
 283     def _get_configured_option_value(self, attribute, value):
 284         if value is None:
 285             return ''
 286         elif attribute in self._list_attributes:
 287             return ', '.join(value)
 288         elif attribute in self._function_attributes:
 289             return _util.import_name(value)
 290         return str(value)
 291
 292     def _get_configured_attribute_value(self, attribute, key, data):
 293         if attribute in self._boolean_attributes:
 294             return data.getboolean(key)
 295         elif attribute in self._integer_attributes:
 296             return data.getint(key)
 297         elif attribute in self._list_attributes:
 298             return [x.strip() for x in data[key].split(',')]
 299         elif attribute in self._function_attributes:
 300             if data[key]:
 301                 return _util.import_function(data[key])
 302             return None
 303         return data[key]
 304
 305     def reset(self):
 306         """Reset dynamic data
 307         """
 308         self.etag = None
 309         self.modified = None
 310         self.seen = {}
 311
 312     def _set_name(self, name):
 313         if not self._name_regexp.match(name):
 314             raise _error.InvalidFeedName(name=name, feed=self)
 315         self.name = name
 316         self.section = 'feed.{}'.format(self.name)
 317
 318     def _fetch(self):
 319         """Fetch and parse a feed using feedparser.
 320
 321         >>> feed = Feed(
 322         ...    name='test-feed',
 323         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 324         >>> parsed = feed._fetch()
 325         >>> parsed.status
 326         200
 327         """
 328         _LOG.info('fetch {}'.format(self))
 329         if not self.url:
 330             raise _error.InvalidFeedConfig(setting='url', feed=self)
 331         if self.section in self.config:
 332             config = self.config[self.section]
 333         else:
 334             config = self.config['DEFAULT']
 335         proxy = config['proxy']
 336         timeout = config.getint('feed-timeout')
 337         kwargs = {}
 338         if proxy:
 339             kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
 340         f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
 341         return f(self.url, self.etag, modified=self.modified, **kwargs)
 342
 343     def _process(self, parsed):
 344         _LOG.info('process {}'.format(self))
 345         self._check_for_errors(parsed)
 346         for entry in reversed(parsed.entries):
 347             _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
 348             processed = self._process_entry(parsed=parsed, entry=entry)
 349             if processed:
 350                 guid,id_,sender,message = processed
 351                 if self.post_process:
 352                     message = self.post_process(
 353                         feed=self, parsed=parsed, entry=entry, guid=guid,
 354                         message=message)
 355                     if not message:
 356                         continue
 357                 yield (guid, id_, sender, message)
 358
 359     def _check_for_errors(self, parsed):
 360         warned = False
 361         status = getattr(parsed, 'status', 200)
 362         _LOG.debug('HTTP status {}'.format(status))
 363         if status == 301:
 364             _LOG.info('redirect {} from {} to {}'.format(
 365                     self.name, self.url, parsed['url']))
 366             self.url = parsed['url']
 367         elif status not in [200, 302, 304]:
 368             raise _error.HTTPError(status=status, feed=self)
 369
 370         http_headers = parsed.get('headers', {})
 371         if http_headers:
 372             _LOG.debug('HTTP headers: {}'.format(http_headers))
 373         if not http_headers:
 374             _LOG.warning('could not get HTTP headers: {}'.format(self))
 375             warned = True
 376         else:
 377             if 'html' in http_headers.get('content-type', 'rss'):
 378                 _LOG.warning('looks like HTML: {}'.format(self))
 379                 warned = True
 380             if http_headers.get('content-length', '1') == '0':
 381                 _LOG.warning('empty page: {}'.format(self))
 382                 warned = True
 383
 384         version = parsed.get('version', None)
 385         if version:
 386             _LOG.debug('feed version {}'.format(version))
 387         else:
 388             _LOG.warning('unrecognized version: {}'.format(self))
 389             warned = True
 390
 391         exc = parsed.get('bozo_exception', None)
 392         if isinstance(exc, _socket.timeout):
 393             _LOG.error('timed out: {}'.format(self))
 394             warned = True
 395         elif isinstance(exc, OSError):
 396             _LOG.error('{}: {}'.format(exc, self))
 397             warned = True
 398         elif isinstance(exc, _SOCKET_ERRORS):
 399             _LOG.error('{}: {}'.format(exc, self))
 400             warned = True
 401         elif isinstance(exc, _feedparser.zlib.error):
 402             _LOG.error('broken compression: {}'.format(self))
 403             warned = True
 404         elif isinstance(exc, (IOError, AttributeError)):
 405             _LOG.error('{}: {}'.format(exc, self))
 406             warned = True
 407         elif isinstance(exc, KeyboardInterrupt):
 408             raise exc
 409         elif isinstance(exc, _sax.SAXParseException):
 410             _LOG.error('sax parsing error: {}: {}'.format(exc, self))
 411             warned = True
 412         elif (parsed.bozo and
 413               isinstance(exc, _feedparser.CharacterEncodingOverride)):
 414             _LOG.warning(
 415                 'incorrectly declared encoding: {}: {}'.format(exc, self))
 416             warned = True
 417         elif parsed.bozo or exc:
 418             if exc is None:
 419                 exc = "can't process"
 420             _LOG.error('processing error: {}: {}'.format(exc, self))
 421             warned = True
 422
 423         if (not warned and
 424             status in [200, 302] and
 425             not parsed.entries and
 426             not version):
 427             raise _error.ProcessingError(parsed=parsed, feed=feed)
 428
 429     def _html2text(self, html, baseurl='', default=None):
 430         self.config.setup_html2text(section=self.section)
 431         try:
 432             return _html2text.html2text(html=html, baseurl=baseurl)
 433         except _html_parser.HTMLParseError as e:
 434             if default is not None:
 435                 return default
 436             raise
 437
 438     def _process_entry(self, parsed, entry):
 439         id_ = self._get_entry_id(entry)
 440         # If .trust_guid isn't set, we get back hashes of the content.
 441         # Instead of letting these run wild, we put them in context
 442         # by associating them with the actual ID (if it exists).
 443         guid = entry.get('id', id_)
 444         if isinstance(guid, dict):
 445             guid = guid.values()[0]
 446         if guid in self.seen:
 447             if self.seen[guid]['id'] == id_:
 448                 _LOG.debug('already seen {}'.format(id_))
 449                 return  # already seen
 450         sender = self._get_entry_email(parsed=parsed, entry=entry)
 451         subject = self._get_entry_title(entry)
 452         extra_headers = _collections.OrderedDict((
 453                 ('Date', self._get_entry_date(entry)),
 454                 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
 455                 ('User-Agent', _USER_AGENT),
 456                 ('X-RSS-Feed', self.url),
 457                 ('X-RSS-ID', id_),
 458                 ('X-RSS-URL', self._get_entry_link(entry)),
 459                 ('X-RSS-TAGS', self._get_entry_tags(entry)),
 460                 ))
 461         for k,v in extra_headers.items():  # remove empty tags, etc.
 462             if v is None:
 463                 extra_headers.pop(k)
 464         if self.bonus_header:
 465             for header in self.bonus_header.splitlines():
 466                 if ':' in header:
 467                     key,value = header.split(':', 1)
 468                     extra_headers[key.strip()] = value.strip()
 469                 else:
 470                     _LOG.warning(
 471                         'malformed bonus-header: {}'.format(
 472                             self.bonus_header))
 473
 474         content = self._get_entry_content(entry)
 475         try:
 476             content = self._process_entry_content(
 477                 entry=entry, content=content, subject=subject)
 478         except _error.ProcessingError as e:
 479             e.parsed = parsed
 480             raise
 481         message = _email.get_message(
 482             sender=sender,
 483             recipient=self.to,
 484             subject=subject,
 485             body=content['value'],
 486             content_type=content['type'].split('/', 1)[1],
 487             extra_headers=extra_headers,
 488             config=self.config,
 489             section=self.section)
 490         return (guid, id_, sender, message)
 491
 492     def _get_entry_id(self, entry):
 493         """Get best ID from an entry."""
 494         if self.trust_guid:
 495             if getattr(entry, 'id', None):
 496                 # Newer versions of feedparser could return a dictionary
 497                 if isinstance(entry.id, dict):
 498                     return entry.id.values()[0]
 499                 return entry.id
 500         content = self._get_entry_content(entry)
 501         content_value = content['value'].strip()
 502         if content_value:
 503             return _hashlib.sha1(
 504                 content_value.encode('unicode-escape')).hexdigest()
 505         elif getattr(entry, 'link', None):
 506             return _hashlib.sha1(
 507                 entry.link.encode('unicode-escape')).hexdigest()
 508         elif getattr(entry, 'title', None):
 509             return _hashlib.sha1(
 510                 entry.title.encode('unicode-escape')).hexdigest()
 511
 512     def _get_entry_link(self, entry):
 513         return entry.get('link', None)
 514
 515     def _get_entry_title(self, entry):
 516         if hasattr(entry, 'title_detail') and entry.title_detail:
 517             title = entry.title_detail.value
 518             if 'html' in entry.title_detail.type:
 519                 title = self._html2text(title, default=title)
 520         else:
 521             content = self._get_entry_content(entry)
 522             value = content['value']
 523             if content['type'] in ('text/html', 'application/xhtml+xml'):
 524                 value = self._html2text(value, default=value)
 525             title = value[:70]
 526         title = title.replace('\n', ' ').strip()
 527         return title
 528
 529     def _get_entry_date(self, entry):
 530         datetime = _time.gmtime()
 531         if self.date_header:
 532             for datetype in self.date_header_order:
 533                 kind = datetype + '_parsed'
 534                 if entry.get(kind, None):
 535                     datetime = entry[kind]
 536                     break
 537         return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 538
 539     def _get_entry_name(self, parsed, entry):
 540         """Get the best name
 541
 542         >>> import feedparser
 543         >>> f = Feed(name='test-feed')
 544         >>> parsed = feedparser.parse(
 545         ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
 546         ...     '  <entry>\\n'
 547         ...     '    <author>\\n'
 548         ...     '      <name>Example author</name>\\n'
 549         ...     '      <email>me@example.com</email>\\n'
 550         ...     '      <url>http://example.com/</url>\\n'
 551         ...     '    </author>\\n'
 552         ...     '  </entry>\\n'
 553         ...     '</feed>\\n'
 554         ...     )
 555         >>> entry = parsed.entries[0]
 556         >>> f.name_format = ''
 557         >>> f._get_entry_name(parsed, entry)
 558         ''
 559         >>> f.name_format = '{author}'
 560         >>> f._get_entry_name(parsed, entry)
 561         'Example author'
 562         >>> f.name_format = '{feed-title}: {author}'
 563         >>> f._get_entry_name(parsed, entry)
 564         ': Example author'
 565         >>> f.name_format = '{author} ({feed.name})'
 566         >>> f._get_entry_name(parsed, entry)
 567         'Example author (test-feed)'
 568         """
 569         if not self.name_format:
 570             return ''
 571         data = {
 572             'feed': self,
 573             'feed-title': '<feed title>',
 574             'author': '<author>',
 575             'publisher': '<publisher>',
 576             }
 577         feed = parsed.feed
 578         data['feed-title'] = feed.get('title', '')
 579         for x in [entry, feed]:
 580             if 'name' in x.get('author_detail', []):
 581                 if x.author_detail.name:
 582                     data['author'] = x.author_detail.name
 583                     break
 584         if 'name' in feed.get('publisher_detail', []):
 585             data['publisher'] = feed.publisher_detail.name
 586         name = self.name_format.format(**data)
 587         return _html2text.unescape(name)
 588
 589     def _validate_email(self, email, default=None):
 590         """Do a basic quality check on email address
 591
 592         Return `default` if the address doesn't appear to be
 593         well-formed.  If `default` is `None`, return
 594         `self.from_email`.
 595
 596         >>> f = Feed(name='test-feed')
 597         >>> f._validate_email('valid@example.com', 'default@example.com')
 598         'valid@example.com'
 599         >>> f._validate_email('invalid@', 'default@example.com')
 600         'default@example.com'
 601         >>> f._validate_email('@invalid', 'default@example.com')
 602         'default@example.com'
 603         >>> f._validate_email('invalid', 'default@example.com')
 604         'default@example.com'
 605         """
 606         parts = email.split('@')
 607         if len(parts) != 2 or '' in parts:
 608             if default is None:
 609                 return self.from_email
 610             return default
 611         return email
 612
 613     def _get_entry_address(self, parsed, entry):
 614         """Get the best From email address ('<jdoe@a.com>')
 615
 616         If the best guess isn't well-formed (something@somthing.com),
 617         use `self.from_email` instead.
 618         """
 619         if self.force_from:
 620             return self.from_email
 621         feed = parsed.feed
 622         if 'email' in entry.get('author_detail', []):
 623             return self._validate_email(entry.author_detail.email)
 624         elif 'email' in feed.get('author_detail', []):
 625             return self._validate_email(feed.author_detail.email)
 626         if self.use_publisher_email:
 627             if 'email' in feed.get('publisher_detail', []):
 628                 return self._validate_email(feed.publisher_detail.email)
 629             if feed.get('errorreportsto', None):
 630                 return self._validate_email(feed.errorreportsto)
 631         _LOG.debug('no sender address found, fallback to default')
 632         return self.from_email
 633
 634     def _get_entry_email(self, parsed, entry):
 635         """Get the best From email address ('John <jdoe@a.com>')
 636         """
 637         name = self._get_entry_name(parsed=parsed, entry=entry)
 638         address = self._get_entry_address(parsed=parsed, entry=entry)
 639         return _formataddr((name, address))
 640
 641     def _get_entry_tags(self, entry):
 642         """Add post tags, if available
 643
 644         >>> f = Feed(name='test-feed')
 645         >>> f._get_entry_tags({
 646         ...         'tags': [{'term': 'tag1',
 647         ...                   'scheme': None,
 648         ...                   'label': None}]})
 649         'tag1'
 650         >>> f._get_entry_tags({
 651         ...         'tags': [{'term': 'tag1',
 652         ...                   'scheme': None,
 653         ...                   'label': None},
 654         ...                  {'term': 'tag2',
 655         ...                   'scheme': None,
 656         ...                   'label': None}]})
 657         'tag1,tag2'
 658
 659         Test some troublesome cases.  No tags:
 660
 661         >>> f._get_entry_tags({})
 662
 663         Empty tags:
 664
 665         >>> f._get_entry_tags({'tags': []})
 666
 667         Tags without a ``term`` entry:
 668
 669         >>> f._get_entry_tags({
 670         ...         'tags': [{'scheme': None,
 671         ...                   'label': None}]})
 672
 673         Tags with an empty term:
 674
 675         >>> f._get_entry_tags({
 676         ...         'tags': [{'term': '',
 677         ...                   'scheme': None,
 678         ...                   'label': None}]})
 679         """
 680         taglist = [tag['term'] for tag in entry.get('tags', [])
 681                    if tag.get('term', '')]
 682         if taglist:
 683             return ','.join(taglist)
 684
 685     def _get_entry_content(self, entry):
 686         """Select the best content from an entry.
 687
 688         Returns a feedparser content dict.
 689         """
 690         # How this works:
 691         #  * We have a bunch of potential contents.
 692         #  * We go thru looking for our first choice.
 693         #    (HTML or text, depending on self.html_mail)
 694         #  * If that doesn't work, we go thru looking for our second choice.
 695         #  * If that still doesn't work, we just take the first one.
 696         #
 697         # Possible future improvement:
 698         #  * Instead of just taking the first one
 699         #    pick the one in the "best" language.
 700         #  * HACK: hardcoded .html_mail, should take a tuple of media types
 701         contents = list(entry.get('content', []))
 702         if entry.get('summary_detail', None):
 703             contents.append(entry.summary_detail)
 704         if self.html_mail:
 705             types = ['text/html', 'text/plain']
 706         else:
 707             types = ['text/plain', 'text/html']
 708         for content_type in types:
 709             for content in contents:
 710                 if content['type'] == content_type:
 711                     return content
 712         if contents:
 713             return contents[0]
 714         return {'type': 'text/plain', 'value': ''}
 715
 716     def _process_entry_content(self, entry, content, subject):
 717         "Convert entry content to the requested format."
 718         link = self._get_entry_link(entry)
 719         if self.html_mail:
 720             lines = [
 721                 '<!DOCTYPE html>',
 722                 '<html>',
 723                 '  <head>',
 724                 ]
 725             if self.use_css and self.css:
 726                 lines.extend([
 727                         '    <style type="text/css">',
 728                         self.css,
 729                         '    </style>',
 730                         ])
 731             lines.extend([
 732                     '</head>',
 733                     '<body>',
 734                     '<div id="entry">',
 735                     '<h1 class="header"><a href="{}">{}</a></h1>'.format(
 736                         link, subject),
 737                     '<div id="body">',
 738                     ])
 739             if content['type'] in ('text/html', 'application/xhtml+xml'):
 740                 lines.append(content['value'].strip())
 741             else:
 742                 lines.append(_saxutils.escape(content['value'].strip()))
 743             lines.append('</div>')
 744             lines.extend([
 745                     '<div class="footer">'
 746                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
 747                     ])
 748             for enclosure in getattr(entry, 'enclosures', []):
 749                 if getattr(enclosure, 'url', None):
 750                     lines.append(
 751                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 752                             enclosure.url))
 753                 if getattr(enclosure, 'src', None):
 754                     lines.append(
 755                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
 756                             enclosure.src))
 757                     lines.append(
 758                         '<p><img src="{}" /></p>'.format(enclosure.src))
 759             for elink in getattr(entry, 'links', []):
 760                 if elink.get('rel', None) == 'via':
 761                     url = elink['href']
 762                     title = elink.get('title', url)
 763                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
 764                             url, title))
 765             lines.extend([
 766                     '</div>',  # /footer
 767                     '</div>',  # /entry
 768                     '</body>',
 769                     '</html>',
 770                     ''])
 771             content['type'] = 'text/html'
 772             content['value'] = '\n'.join(lines)
 773             return content
 774         else:  # not self.html_mail
 775             if content['type'] in ('text/html', 'application/xhtml+xml'):
 776                 try:
 777                     lines = [self._html2text(content['value'])]
 778                 except _html_parser.HTMLParseError as e:
 779                     raise _error.ProcessingError(parsed=None, feed=self)
 780             else:
 781                 lines = [content['value']]
 782             lines.append('')
 783             lines.append('URL: {}'.format(link))
 784             for enclosure in getattr(entry, 'enclosures', []):
 785                 if getattr(enclosure, 'url', None):
 786                     lines.append('Enclosure: {}'.format(enclosure.url))
 787                 if getattr(enclosure, 'src', None):
 788                     lines.append('Enclosure: {}'.format(enclosure.src))
 789             for elink in getattr(entry, 'links', []):
 790                 if elink.get('rel', None) == 'via':
 791                     url = elink['href']
 792                     title = elink.get('title', url)
 793                     lines.append('Via: {} {}'.format(title, url))
 794             content['type'] = 'text/plain'
 795             content['value'] = '\n'.join(lines)
 796             return content
 797
 798     def _send(self, sender, message):
 799         _LOG.info('send message for {}'.format(self))
 800         section = self.section
 801         if section not in self.config:
 802             section = 'DEFAULT'
 803         _email.send(sender=sender, recipient=self.to, message=message,
 804                     config=self.config, section=section)
 805
 806     def run(self, send=True):
 807         """Fetch and process the feed, mailing entry emails.
 808
 809         >>> feed = Feed(
 810         ...    name='test-feed',
 811         ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
 812         >>> def send(sender, message):
 813         ...    print('send from {}:'.format(sender))
 814         ...    print(message.as_string())
 815         >>> feed._send = send
 816         >>> feed.to = 'jdoe@dummy.invalid'
 817         >>> #parsed = feed.run()  # enable for debugging
 818         """
 819         if not self.to:
 820             raise _error.NoToEmailAddress(feed=self)
 821         parsed = self._fetch()
 822
 823         if self.digest:
 824             digest = self._new_digest()
 825             seen = []
 826
 827         for (guid, id_, sender, message) in self._process(parsed):
 828             _LOG.debug('new message: {}'.format(message['Subject']))
 829             if self.digest:
 830                 seen.append((guid, id_))
 831                 self._append_to_digest(digest=digest, message=message)
 832             else:
 833                 if send:
 834                     self._send(sender=sender, message=message)
 835                 if guid not in self.seen:
 836                     self.seen[guid] = {}
 837                 self.seen[guid]['id'] = id_
 838
 839         if self.digest and seen:
 840             if self.digest_post_process:
 841                 digest = self.digest_post_process(
 842                     feed=self, parsed=parsed, seen=seen, message=digest)
 843                 if not digest:
 844                     return
 845             self._send_digest(
 846                 digest=digest, seen=seen, sender=sender, send=send)
 847
 848         self.etag = parsed.get('etag', None)
 849         self.modified = parsed.get('modified', None)
 850
 851     def _new_digest(self):
 852         digest = _MIMEMultipart('digest')
 853         digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
 854         digest['Subject'] = 'digest for {}'.format(self.name)
 855         digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
 856         digest['User-Agent'] = _USER_AGENT
 857         digest['X-RSS-Feed'] = self.url
 858         return digest
 859
 860     def _append_to_digest(self, digest, message):
 861         part = _MIMEMessage(message)
 862         part.add_header('Content-Disposition', 'attachment')
 863         digest.attach(part)
 864
 865     def _send_digest(self, digest, seen, sender, send=True):
 866         """Send a digest message
 867
 868         The date is extracted from the last message in the digest
 869         payload.  We assume that this part exists.  If you don't have
 870         any messages in the digest, don't call this function.
 871         """
 872         digest['From'] = sender  # TODO: _Header(), _formataddr()...
 873         last_part = digest.get_payload()[-1]
 874         last_message = last_part.get_payload()[0]
 875         digest['Date'] = last_message['Date']
 876
 877         _LOG.debug('new digest for {}'.format(self))
 878         if send:
 879             self._send(sender=sender, message=digest)
 880         for (guid, id_) in seen:
 881             if guid not in self.seen:
 882                 self.seen[guid] = {}
 883             self.seen[guid]['id'] = id_