1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.utils import formataddr as _formataddr
32 import hashlib as _hashlib
33 import html.parser as _html_parser
35 import socket as _socket
37 import urllib.error as _urllib_error
38 import urllib.request as _urllib_request
40 import xml.sax as _sax
41 import xml.sax.saxutils as _saxutils
43 import feedparser as _feedparser
44 import html2text as _html2text
47 from . import __version__
48 from . import LOG as _LOG
49 from . import config as _config
50 from . import email as _email
51 from . import error as _error
52 from . import util as _util
# Identify ourselves (name, version, project URL) in HTTP requests that
# feedparser makes on our behalf.
_feedparser.USER_AGENT = 'rss2email/{version} +{url}'.format(
    version=__version__, url=__url__)

# Install a default global opener for urllib-based fetches.
_urllib_request.install_opener(_urllib_request.build_opener())
58 for e in ['error', 'herror', 'gaierror']:
59 if hasattr(_socket, e):
60 _SOCKET_ERRORS.append(getattr(_socket, e))
61 del e # cleanup namespace
62 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
66 """Utility class for feed manipulation and storage.
70 >>> from .config import CONFIG
73 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
75 test-feed (http://example.com/feed.atom -> a@b.com)
79 'user@rss2email.invalid'
81 >>> feed.from_email = 'a@b.com'
82 >>> feed.save_to_config()
83 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
85 from = user@rss2email.invalid
90 url = http://example.com/feed.atom
95 >>> feed.etag = 'dummy etag'
96 >>> string = pickle.dumps(feed)
97 >>> feed = pickle.loads(string)
98 >>> feed.load_from_config(config=CONFIG)
102 'http://example.com/feed.atom'
104 Names can only contain ASCII letters, digits, and '._-'. Here the
105 invalid space causes an exception:
107 >>> Feed(name='invalid name')
108 Traceback (most recent call last):
110 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
112 You must define a URL:
114 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
115 Traceback (most recent call last):
117 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
122 >>> CONFIG['DEFAULT']['to'] = ''
123 >>> test_section = CONFIG.pop('feed.test-feed')
125 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
127 # saved/loaded from feed.dat using __getstate__/__setstate__.
128 _dynamic_attributes = [
135 ## saved/loaded from ConfigParser instance
136 # attributes that aren't in DEFAULT
137 _non_default_configured_attributes = [
140 # attributes that are in DEFAULT
141 _default_configured_attributes = [
142 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
143 _default_configured_attributes[
144 _default_configured_attributes.index('from')
145 ] = 'from_email' # `from` is a Python keyword
146 # all attributes that are saved/loaded from .config
147 _configured_attributes = (
148 _non_default_configured_attributes + _default_configured_attributes)
149 # attribute name -> .config option
150 _configured_attribute_translations = dict(
151 (attr,attr) for attr in _non_default_configured_attributes)
152 _configured_attribute_translations.update(dict(
153 zip(_default_configured_attributes,
154 _config.CONFIG['DEFAULT'].keys())))
155 # .config option -> attribute name
156 _configured_attribute_inverse_translations = dict(
157 (v,k) for k,v in _configured_attribute_translations.items())
159 # hints for value conversion
160 _boolean_attributes = [
162 'use_publisher_email',
170 'links_after_each_paragraph',
175 _integer_attributes = [
    def __init__(self, name=None, url=None, to=None, config=None):
        """Create a feed: validate the name, seed dynamic state, and
        load configured settings from ``config``.

        NOTE(review): this view appears to omit some lines -- ``url``
        and ``to`` are accepted but their storage is not visible here;
        confirm against the full file.
        """
        self._set_name(name=name)
        # Seed the dynamic attributes through the shared __setstate__
        # path, reading whatever values getattr currently resolves
        # (presumably defaults set elsewhere -- confirm).
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)
198 return '{} ({} -> {})'.format(self.name, self.url, self.to)
201 return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # NOTE(review): a line appears to be missing from this view here
        # (likely ``return dict(``) -- the generator expression below is
        # otherwise orphaned; confirm against the full file.
        (key,getattr(self,key)) for key in self._dynamic_attributes)

    get_state = __getstate__ # make it publicly accessible
210 def __setstate__(self, state):
211 "Restore dynamic attributes"
212 keys = sorted(state.keys())
213 if keys != sorted(self._dynamic_attributes):
214 raise ValueError(state)
215 self._set_name(name=state['name'])
216 self.__dict__.update(state)
218 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            # Translate the attribute name to its .config option name
            # (e.g. from_email -> from).
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                # Serialize to a config-file string representation.
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
                # Only store options that differ from DEFAULT, or that
                # have no DEFAULT entry at all.
                if (attr in self._non_default_configured_attributes or
                    value != default[key]):
                # NOTE(review): the body of this ``if`` (likely
                # ``data[key] = value``) is not visible in this view --
                # confirm against the full file.
        self.config[self.section] = data
235 def load_from_config(self, config=None):
236 "Restore configured attributes"
238 config = _config.CONFIG
240 if self.section in self.config:
241 data = self.config[self.section]
243 data = self.config['DEFAULT']
244 keys = sorted(data.keys())
245 expected = sorted(self._configured_attribute_translations.values())
248 if (key not in keys and
249 key not in self._non_default_configured_attributes):
250 raise _error.InvalidFeedConfig(
251 setting=key, feed=self,
252 message='missing configuration key: {}'.format(key))
254 if key not in expected:
255 raise _error.InvalidFeedConfig(
256 setting=key, feed=self,
257 message='extra configuration key: {}'.format(key))
259 (self._configured_attribute_inverse_translations[k],
260 self._get_configured_attribute_value(
261 attribute=self._configured_attribute_inverse_translations[k],
263 for k in data.keys())
264 for attr in self._non_default_configured_attributes:
267 self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        # Serialize ``value`` for ConfigParser storage; list-valued
        # attributes are flattened to comma-separated strings.
        if value and attribute in self._list_attributes:
            return ', '.join(value)
        # NOTE(review): the fallthrough return for non-list values is
        # not visible in this view (likely ``return str(value)``) --
        # confirm against the full file.
    def _get_configured_attribute_value(self, attribute, key, data):
        """Deserialize config option ``key`` from ``data`` into the
        native type hinted for ``attribute`` (bool/int/list).
        """
        if attribute in self._boolean_attributes:
            return data.getboolean(key)
        elif attribute in self._integer_attributes:
            return data.getint(key)
        elif attribute in self._list_attributes:
            # Comma-separated string -> list of stripped items.
            return [x.strip() for x in data[key].split(',')]
        # NOTE(review): the plain-string fallthrough (likely
        # ``return data[key]``) is not visible in this view -- confirm.
284 """Reset dynamic data
    def _set_name(self, name):
        # Feed names become config section names, so restrict them to a
        # safe ASCII charset (letters, digits, '._-') per _name_regexp.
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        # NOTE(review): the ``self.name = name`` assignment appears to
        # be missing from this view -- ``self.name`` is read below;
        # confirm against the full file.
        self.section = 'feed.{}'.format(self.name)
297 """Fetch and parse a feed using feedparser.
300 ... name='test-feed',
301 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
302 >>> parsed = feed._fetch()
306 _LOG.info('fetch {}'.format(self))
308 raise _error.InvalidFeedConfig(setting='url', feed=self)
309 if self.section in self.config:
310 config = self.config[self.section]
312 config = self.config['DEFAULT']
313 proxy = config['proxy']
314 timeout = config.getint('feed-timeout')
317 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
318 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
319 return f(self.url, self.etag, modified=self.modified, **kwargs)
321 def _process(self, parsed):
322 _LOG.info('process {}'.format(self))
323 self._check_for_errors(parsed)
324 for entry in reversed(parsed.entries):
325 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
326 processed = self._process_entry(parsed=parsed, entry=entry)
330 def _check_for_errors(self, parsed):
332 status = getattr(parsed, 'status', 200)
333 _LOG.debug('HTTP status {}'.format(status))
335 _LOG.info('redirect {} from {} to {}'.format(
336 self.name, self.url, parsed['url']))
337 self.url = parsed['url']
338 elif status not in [200, 302, 304]:
339 raise _error.HTTPError(status=status, feed=self)
341 http_headers = parsed.get('headers', {})
343 _LOG.debug('HTTP headers: {}'.format(http_headers))
345 _LOG.warning('could not get HTTP headers: {}'.format(self))
348 if 'html' in http_headers.get('content-type', 'rss'):
349 _LOG.warning('looks like HTML: {}'.format(self))
351 if http_headers.get('content-length', '1') == '0':
352 _LOG.warning('empty page: {}'.format(self))
355 version = parsed.get('version', None)
357 _LOG.debug('feed version {}'.format(version))
359 _LOG.warning('unrecognized version: {}'.format(self))
362 exc = parsed.get('bozo_exception', None)
363 if isinstance(exc, _socket.timeout):
364 _LOG.error('timed out: {}'.format(self))
366 elif isinstance(exc, OSError):
367 _LOG.error('{}: {}'.format(exc, self))
369 elif isinstance(exc, _SOCKET_ERRORS):
370 _LOG.error('{}: {}'.format(exc, self))
372 elif isinstance(exc, _feedparser.zlib.error):
373 _LOG.error('broken compression: {}'.format(self))
375 elif isinstance(exc, (IOError, AttributeError)):
376 _LOG.error('{}: {}'.format(exc, self))
378 elif isinstance(exc, KeyboardInterrupt):
380 elif isinstance(exc, _sax.SAXParseException):
381 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
383 elif parsed.bozo or exc:
385 exc = "can't process"
386 _LOG.error('processing error: {}: {}'.format(exc, self))
390 status in [200, 302] and
391 not parsed.entries and
393 raise _error.ProcessingError(parsed=parsed, feed=feed)
395 def _process_entry(self, parsed, entry):
396 id_ = self._get_entry_id(entry)
397 # If .trust_guid isn't set, we get back hashes of the content.
398 # Instead of letting these run wild, we put them in context
399 # by associating them with the actual ID (if it exists).
400 guid = entry.get('id', id_)
401 if isinstance(guid, dict):
402 guid = guid.values()[0]
403 if guid in self.seen:
404 if self.seen[guid]['id'] == id_:
405 _LOG.debug('already seen {}'.format(id_))
406 return # already seen
407 sender = self._get_entry_email(parsed=parsed, entry=entry)
408 subject = self._get_entry_title(entry)
409 extra_headers = _collections.OrderedDict((
410 ('Date', self._get_entry_date(entry)),
411 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
412 ('User-Agent', 'rss2email'),
413 ('X-RSS-Feed', self.url),
415 ('X-RSS-URL', self._get_entry_link(entry)),
416 ('X-RSS-TAGS', self._get_entry_tags(entry)),
418 for k,v in extra_headers.items(): # remove empty tags, etc.
421 if self.bonus_header:
422 for header in self.bonus_header.splitlines():
424 key,value = header.split(':', 1)
425 extra_headers[key.strip()] = value.strip()
428 'malformed bonus-header: {}'.format(
431 content = self._get_entry_content(entry)
433 content = self._process_entry_content(
434 entry=entry, content=content, subject=subject)
435 except _error.ProcessingError as e:
438 message = _email.get_message(
442 body=content['value'],
443 content_type=content['type'].split('/', 1)[1],
444 extra_headers=extra_headers,
446 section=self.section)
447 return (guid, id_, sender, message)
    def _get_entry_id(self, entry):
        """Get best ID from an entry."""
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # NOTE(review): in Python 3, dict.values() returns a
                # view that does not support indexing -- this line would
                # raise TypeError; should probably be
                # ``list(entry.id.values())[0]``.
                return entry.id.values()[0]
        # No usable id: fall back to hashing content/link/title.
        content = self._get_entry_content(entry)
        content_value = content['value'].strip()
        # NOTE(review): a guard line (likely ``if content_value:``)
        # appears to be missing here -- the ``elif`` chain below is
        # otherwise orphaned; confirm against the full file.
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'link', None):
            return _hashlib.sha1(
                entry.link.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'title', None):
            return _hashlib.sha1(
                entry.title.encode('unicode-escape')).hexdigest()
469 def _get_entry_link(self, entry):
470 return entry.get('link', None)
472 def _get_entry_title(self, entry):
473 if hasattr(entry, 'title_detail') and entry.title_detail:
474 title = entry.title_detail.value
475 if 'html' in entry.title_detail.type:
476 title = _html2text.html2text(title)
478 content = self._get_entry_content(entry)
479 value = content['value']
480 if content['type'] in ('text/html', 'application/xhtml+xml'):
481 value = _html2text.html2text(value)
483 title = title.replace('\n', ' ').strip()
486 def _get_entry_date(self, entry):
487 datetime = _time.gmtime()
489 for datetype in self.date_header_order:
490 kind = datetype + '_parsed'
491 if entry.get(kind, None):
492 datetime = entry[kind]
494 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
496 def _get_entry_name(self, parsed, entry):
499 >>> import feedparser
500 >>> f = Feed(name='test-feed')
501 >>> parsed = feedparser.parse(
502 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
505 ... ' <name>Example author</name>\\n'
506 ... ' <email>me@example.com</email>\\n'
507 ... ' <url>http://example.com/</url>\\n'
512 >>> entry = parsed.entries[0]
513 >>> f.friendly_name = False
514 >>> f._get_entry_name(parsed, entry)
516 >>> f.friendly_name = True
517 >>> f._get_entry_name(parsed, entry)
520 if not self.friendly_name:
524 parts.append(feed.get('title', ''))
525 for x in [entry, feed]:
526 if 'name' in x.get('author_detail', []):
527 if x.author_detail.name:
530 parts.append(x.author_detail.name)
532 if not ''.join(parts) and self.use_publisher_email:
533 if 'name' in feed.get('publisher_detail', []):
536 parts.append(feed.publisher_detail.name)
537 return _html2text.unescape(''.join(parts))
539 def _validate_email(self, email, default=None):
540 """Do a basic quality check on email address
542 Return `default` if the address doesn't appear to be
543 well-formed. If `default` is `None`, return
546 >>> f = Feed(name='test-feed')
547 >>> f._validate_email('valid@example.com', 'default@example.com')
549 >>> f._validate_email('invalid@', 'default@example.com')
550 'default@example.com'
551 >>> f._validate_email('@invalid', 'default@example.com')
552 'default@example.com'
553 >>> f._validate_email('invalid', 'default@example.com')
554 'default@example.com'
556 parts = email.split('@')
557 if len(parts) != 2 or '' in parts:
559 return self.from_email
563 def _get_entry_address(self, parsed, entry):
564 """Get the best From email address ('<jdoe@a.com>')
566 If the best guess isn't well-formed (something@somthing.com),
567 use `self.from_email` instead.
570 return self.from_email
572 if 'email' in entry.get('author_detail', []):
573 return self._validate_email(entry.author_detail.email)
574 elif 'email' in feed.get('author_detail', []):
575 return self._validate_email(feed.author_detail.email)
576 if self.use_publisher_email:
577 if 'email' in feed.get('publisher_detail', []):
578 return self._validate_email(feed.publisher_detail.email)
579 if feed.get('errorreportsto', None):
580 return self._validate_email(feed.errorreportsto)
581 _LOG.debug('no sender address found, fallback to default')
582 return self.from_email
    def _get_entry_email(self, parsed, entry):
        """Get the best From email address ('John <jdoe@a.com>')

        Combines the friendly name with the validated address using
        ``email.utils.formataddr``.
        """
        name = self._get_entry_name(parsed=parsed, entry=entry)
        address = self._get_entry_address(parsed=parsed, entry=entry)
        return _formataddr((name, address))
591 def _get_entry_tags(self, entry):
592 """Add post tags, if available
594 >>> f = Feed(name='test-feed')
595 >>> f._get_entry_tags({
596 ... 'tags': [{'term': 'tag1',
598 ... 'label': None}]})
600 >>> f._get_entry_tags({
601 ... 'tags': [{'term': 'tag1',
606 ... 'label': None}]})
609 Test some troublesome cases. No tags:
611 >>> f._get_entry_tags({})
615 >>> f._get_entry_tags({'tags': []})
617 Tags without a ``term`` entry:
619 >>> f._get_entry_tags({
620 ... 'tags': [{'scheme': None,
621 ... 'label': None}]})
623 Tags with an empty term:
625 >>> f._get_entry_tags({
626 ... 'tags': [{'term': '',
628 ... 'label': None}]})
630 taglist = [tag['term'] for tag in entry.get('tags', [])
631 if tag.get('term', '')]
633 return ','.join(taglist)
635 def _get_entry_content(self, entry):
636 """Select the best content from an entry.
638 Returns a feedparser content dict.
641 # * We have a bunch of potential contents.
642 # * We go thru looking for our first choice.
643 # (HTML or text, depending on self.html_mail)
644 # * If that doesn't work, we go thru looking for our second choice.
645 # * If that still doesn't work, we just take the first one.
647 # Possible future improvement:
648 # * Instead of just taking the first one
649 # pick the one in the "best" language.
650 # * HACK: hardcoded .html_mail, should take a tuple of media types
651 contents = list(entry.get('content', []))
652 if entry.get('summary_detail', None):
653 contents.append(entry.summary_detail)
655 types = ['text/html', 'text/plain']
657 types = ['text/plain', 'text/html']
658 for content_type in types:
659 for content in contents:
660 if content['type'] == content_type:
664 return {'type': 'text/plain', 'value': ''}
666 def _process_entry_content(self, entry, content, subject):
667 "Convert entry content to the requested format."
668 link = self._get_entry_link(entry)
675 if self.use_css and self.css:
677 ' <style type="text/css">',
685 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
689 if content['type'] in ('text/html', 'application/xhtml+xml'):
690 lines.append(content['value'].strip())
692 lines.append(_saxutils.escape(content['value'].strip()))
693 lines.append('</div>')
695 '<div class="footer">'
696 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
698 for enclosure in getattr(entry, 'enclosures', []):
699 if getattr(enclosure, 'url', None):
701 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
703 if getattr(enclosure, 'src', None):
705 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
708 '<p><img src="{}" /></p>'.format(enclosure.src))
709 for elink in getattr(entry, 'links', []):
710 if elink.get('rel', None) == 'via':
712 title = elink.get('title', url)
713 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
721 content['type'] = 'text/html'
722 content['value'] = '\n'.join(lines)
724 else: # not self.html_mail
725 if content['type'] in ('text/html', 'application/xhtml+xml'):
727 lines = [_html2text.html2text(content['value'])]
728 except _html_parser.HTMLParseError as e:
729 raise _error.ProcessingError(parsed=None, feed=self)
731 lines = [content['value']]
733 lines.append('URL: {}'.format(link))
734 for enclosure in getattr(entry, 'enclosures', []):
735 if getattr(enclosure, 'url', None):
736 lines.append('Enclosure: {}'.format(enclosure.url))
737 if getattr(enclosure, 'src', None):
738 lines.append('Enclosure: {}'.format(enclosure.src))
739 for elink in getattr(entry, 'links', []):
740 if elink.get('rel', None) == 'via':
742 title = elink.get('title', url)
743 lines.append('Via: {} {}'.format(title, url))
744 content['type'] = 'text/plain'
745 content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Send ``message`` from ``sender`` to ``self.to`` via the
        rss2email email helper, using this feed's config section.
        """
        _LOG.info('send message for {}'.format(self))
        section = self.section
        # NOTE(review): the fallback body of this ``if`` (likely
        # ``section = 'DEFAULT'``) is not visible in this view --
        # confirm against the full file.
        if section not in self.config:
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
756 def run(self, send=True):
757 """Fetch and process the feed, mailing entry emails.
760 ... name='test-feed',
761 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
762 >>> def send(sender, message):
763 ... print('send from {}:'.format(sender))
764 ... print(message.as_string())
765 >>> feed._send = send
766 >>> feed.to = 'jdoe@dummy.invalid'
767 >>> #parsed = feed.run() # enable for debugging
770 raise _error.NoToEmailAddress(feed=self)
771 parsed = self._fetch()
772 for (guid, id_, sender, message) in self._process(parsed):
773 _LOG.debug('new message: {}'.format(message['Subject']))
775 self._send(sender=sender, message=message)
776 if guid not in self.seen:
778 self.seen[guid]['id'] = id_
779 self.etag = parsed.get('etag', None)
780 self.modified = parsed.get('modified', None)