1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
import collections as _collections
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
# Identify ourselves to remote servers and install the default urllib
# opener (feedparser fetches through urllib under the hood).
_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket exception classes available on this platform so
# network failures can be caught with a single `except _SOCKET_ERRORS:`.
# The accumulator must be initialized before the loop appends to it.
_SOCKET_ERRORS = []
for e in ['error', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
64 """Utility class for feed manipulation and storage.
68 >>> from .config import CONFIG
71 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
73 test-feed (http://example.com/feed.atom -> a@b.com)
77 'user@rss2email.invalid'
79 >>> feed.from_email = 'a@b.com'
80 >>> feed.save_to_config()
81 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
83 from = user@rss2email.invalid
88 url = http://example.com/feed.atom
93 >>> feed.etag = 'dummy etag'
94 >>> string = pickle.dumps(feed)
95 >>> feed = pickle.loads(string)
96 >>> feed.load_from_config(config=CONFIG)
100 'http://example.com/feed.atom'
102 Names can only contain ASCII letters, digits, and '._-'. Here the
103 invalid space causes an exception:
105 >>> Feed(name='invalid name')
106 Traceback (most recent call last):
108 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
110 You must define a URL:
112 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
113 Traceback (most recent call last):
115 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
120 >>> CONFIG['DEFAULT']['to'] = ''
121 >>> test_section = CONFIG.pop('feed.test-feed')
    # Valid feed names: ASCII letters, digits, and '._-' only; enforced
    # by _set_name() below.
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')

    # Data that's saved/loaded from feed.dat using __getstate__/__setstate__.
    # NOTE(review): the member list is elided in this view of the file —
    # confirm against the full source.
    _dynamic_attributes = [

    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    # NOTE(review): members elided in this view — confirm against the full source.
    _non_default_configured_attributes = [

    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email'  # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())

    # hints for value conversion (used by the _get_configured_*_value
    # helpers below); NOTE(review): list members partially elided.
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',

    _integer_attributes = [
183 def __init__(self, name=None, url=None, to=None, config=None):
184 self._set_name(name=name)
186 self.__setstate__(dict(
187 (attr, getattr(self, attr))
188 for attr in self._dynamic_attributes))
189 self.load_from_config(config=config)
196 return '{} ({} -> {})'.format(self.name, self.url, self.to)
199 return '<Feed {}>'.format(str(self))
201 def __getstate__(self):
202 "Save dyamic attributes"
204 (key,getattr(self,key)) for key in self._dynamic_attributes)
206 def __setstate__(self, state):
207 "Restore dynamic attributes"
208 keys = sorted(state.keys())
209 if keys != sorted(self._dynamic_attributes):
210 raise ValueError(state)
211 self._set_name(name=state['name'])
212 self.__dict__.update(state)
214 def save_to_config(self):
215 "Save configured attributes"
216 data = _collections.OrderedDict()
217 default = self.config['DEFAULT']
218 for attr in self._configured_attributes:
219 key = self._configured_attribute_translations[attr]
220 value = getattr(self, attr)
221 if value is not None:
222 value = self._get_configured_option_value(
223 attribute=attr, value=value)
224 if (attr in self._non_default_configured_attributes or
225 value != default[key]):
227 self.config[self.section] = data
229 def load_from_config(self, config=None):
230 "Restore configured attributes"
232 config = _config.CONFIG
234 if self.section in self.config:
235 data = self.config[self.section]
237 data = self.config['DEFAULT']
238 keys = sorted(data.keys())
239 expected = sorted(self._configured_attribute_translations.values())
242 if (key not in keys and
243 key not in self._non_default_configured_attributes):
244 raise ValueError('missing key: {}'.format(key))
246 if key not in expected:
247 raise ValueError('extra key: {}'.format(key))
249 (self._configured_attribute_inverse_translations[k],
250 self._get_configured_attribute_value(
251 attribute=self._configured_attribute_inverse_translations[k],
253 for k in data.keys())
254 for attr in self._non_default_configured_attributes:
257 self.__dict__.update(data)
259 def _get_configured_option_value(self, attribute, value):
260 if value and attribute in self._list_attributes:
261 return ', '.join(value)
264 def _get_configured_attribute_value(self, attribute, key, data):
265 if attribute in self._boolean_attributes:
266 return data.getboolean(key)
267 elif attribute in self._integer_attributes:
268 return data.getint(key)
269 elif attribute in self._list_attributes:
270 return [x.strip() for x in data[key].split(',')]
274 """Reset dynamic data
280 def _set_name(self, name):
281 if not self._name_regexp.match(name):
282 raise _error.InvalidFeedName(name=name, feed=self)
284 self.section = 'feed.{}'.format(self.name)
287 """Fetch and parse a feed using feedparser.
290 ... name='test-feed',
291 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
292 >>> parsed = feed._fetch()
296 _LOG.info('fetch {}'.format(self))
298 raise _error.InvalidFeedConfig(setting='url', feed=self)
299 if self.section in self.config:
300 config = self.config[self.section]
302 config = self.config['DEFAULT']
303 proxy = config['proxy']
304 timeout = config.getint('feed-timeout')
307 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
308 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
309 return f(self.url, self.etag, modified=self.modified, **kwargs)
311 def _process(self, parsed):
312 _LOG.info('process {}'.format(self))
313 self._check_for_errors(parsed)
314 for entry in reversed(parsed.entries):
315 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
316 processed = self._process_entry(parsed=parsed, entry=entry)
320 def _check_for_errors(self, parsed):
322 status = getattr(parsed, 'status', 200)
323 _LOG.debug('HTTP status {}'.format(status))
325 _LOG.info('redirect {} from {} to {}'.format(
326 self.name, self.url, parsed['url']))
327 self.url = parsed['url']
328 elif status not in [200, 302, 304]:
329 raise _error.HTTPError(status=status, feed=self)
331 http_headers = parsed.get('headers', {})
333 _LOG.debug('HTTP headers: {}'.format(http_headers))
335 _LOG.warning('could not get HTTP headers: {}'.format(self))
338 if 'html' in http_headers.get('content-type', 'rss'):
339 _LOG.warning('looks like HTML: {}'.format(self))
341 if http_headers.get('content-length', '1') == '0':
342 _LOG.warning('empty page: {}'.format(self))
345 version = parsed.get('version', None)
347 _LOG.debug('feed version {}'.format(version))
349 _LOG.warning('unrecognized version: {}'.format(self))
352 exc = parsed.get('bozo_exception', None)
353 if isinstance(exc, _socket.timeout):
354 _LOG.error('timed out: {}'.format(self))
356 elif isinstance(exc, _SOCKET_ERRORS):
358 _LOG.error('{}: {}'.format(exc, self))
360 elif (hasattr(exc, 'reason') and
361 isinstance(exc.reason, _urllib_error.URLError)):
362 if isinstance(exc.reason, _SOCKET_ERRORS):
363 reason = exc.reason.args[1]
366 _LOG.error('{}: {}'.format(exc, self))
368 elif isinstance(exc, _feedparser.zlib.error):
369 _LOG.error('broken compression: {}'.format(self))
371 elif isinstance(exc, (IOError, AttributeError)):
372 _LOG.error('{}: {}'.format(exc, self))
374 elif isinstance(exc, KeyboardInterrupt):
376 elif isinstance(exc, _sax.SAXParseException):
377 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
379 elif parsed.bozo or exc:
381 exc = "can't process"
382 _LOG.error('processing error: {}: {}'.format(exc, self))
386 status in [200, 302] and
387 not parsed.entries and
389 raise _error.ProcessingError(parsed=parsed, feed=feed)
391 def _process_entry(self, parsed, entry):
392 id_ = self._get_entry_id(entry)
393 # If .trust_guid isn't set, we get back hashes of the content.
394 # Instead of letting these run wild, we put them in context
395 # by associating them with the actual ID (if it exists).
396 guid = entry.get('id', id_)
397 if isinstance(guid, dict):
398 guid = guid.values()[0]
399 if guid in self.seen:
400 if self.seen[guid] == id_:
401 _LOG.debug('already seen {}'.format(id_))
402 return # already seen
403 sender = self._get_entry_email(parsed=parsed, entry=entry)
404 link = entry.get('link', None)
405 subject = self._get_entry_title(entry)
406 extra_headers = _collections.OrderedDict((
407 ('Date', self._get_entry_date(entry)),
408 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
409 ('User-Agent', 'rss2email'),
410 ('X-RSS-Feed', self.url),
413 ('X-RSS-TAGS', self._get_entry_tags(entry)),
415 for k,v in extra_headers.items(): # remove empty tags, etc.
418 if self.bonus_header:
419 for header in self.bonus_header.splitlines():
421 key,value = header.split(':', 1)
422 extra_headers[key.strip()] = value.strip()
425 'malformed bonus-header: {}'.format(
428 content = self._get_entry_content(entry)
429 content = self._process_entry_content(
430 entry=entry, content=content, link=link, subject=subject)
431 message = _email.get_message(
435 body=content['value'],
436 content_type=content['type'].split('/', 1)[1],
437 extra_headers=extra_headers)
438 return (guid, id_, sender, message)
440 def _get_entry_id(self, entry):
441 """Get best ID from an entry."""
443 if getattr(entry, 'id', None):
444 # Newer versions of feedparser could return a dictionary
445 if isinstance(entry.id, dict):
446 return entry.id.values()[0]
448 content = self._get_entry_content(entry)
449 content_value = content['value'].strip()
451 return _hashlib.sha1(
452 content_value.encode('unicode-escape')).hexdigest()
453 elif getattr(entry, 'link', None):
454 return _hashlib.sha1(
455 entry.link.encode('unicode-escape')).hexdigest()
456 elif getattr(entry, 'title', None):
457 return _hashlib.sha1(
458 entry.title.encode('unicode-escape')).hexdigest()
460 def _get_entry_title(self, entry):
461 if hasattr(entry, 'title_detail') and entry.title_detail:
462 title = entry.title_detail.value
463 if 'html' in entry.title_detail.type:
464 title = _html2text.html2text(title)
466 content = self._get_entry_content(entry)
467 value = content['value']
468 if content['type'] in ('text/html', 'application/xhtml+xml'):
469 value = _html2text.html2text(value)
471 title = title.replace('\n', ' ').strip()
474 def _get_entry_date(self, entry):
475 datetime = _time.gmtime()
477 for datetype in self.date_header_order:
478 kind = datetype + '_parsed'
479 if entry.get(kind, None):
480 datetime = entry[kind]
482 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
484 def _get_entry_name(self, parsed, entry):
487 >>> import feedparser
488 >>> f = Feed(name='test-feed')
489 >>> parsed = feedparser.parse(
490 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
493 ... ' <name>Example author</name>\\n'
494 ... ' <email>me@example.com</email>\\n'
495 ... ' <url>http://example.com/</url>\\n'
500 >>> entry = parsed.entries[0]
501 >>> f.friendly_name = False
502 >>> f._get_entry_name(parsed, entry)
504 >>> f.friendly_name = True
505 >>> f._get_entry_name(parsed, entry)
508 if not self.friendly_name:
512 parts.append(feed.get('title', ''))
513 for x in [entry, feed]:
514 if 'name' in x.get('author_detail', []):
515 if x.author_detail.name:
518 parts.append(x.author_detail.name)
520 if not ''.join(parts) and self.use_publisher_email:
521 if 'name' in feed.get('publisher_detail', []):
524 parts.append(feed.publisher_detail.name)
525 return _html2text.unescape(''.join(parts))
527 def _validate_email(self, email, default=None):
528 """Do a basic quality check on email address
530 Return `default` if the address doesn't appear to be
531 well-formed. If `default` is `None`, return
534 >>> f = Feed(name='test-feed')
535 >>> f._validate_email('valid@example.com', 'default@example.com')
537 >>> f._validate_email('invalid@', 'default@example.com')
538 'default@example.com'
539 >>> f._validate_email('@invalid', 'default@example.com')
540 'default@example.com'
541 >>> f._validate_email('invalid', 'default@example.com')
542 'default@example.com'
544 parts = email.split('@')
545 if len(parts) != 2 or '' in parts:
547 return self.from_email
551 def _get_entry_address(self, parsed, entry):
552 """Get the best From email address ('<jdoe@a.com>')
554 If the best guess isn't well-formed (something@somthing.com),
555 use `self.from_email` instead.
558 return self.from_email
560 if 'email' in entry.get('author_detail', []):
561 return self._validate_email(entry.author_detail.email)
562 elif 'email' in feed.get('author_detail', []):
563 return self._validate_email(feed.author_detail.email)
564 if self.use_publisher_email:
565 if 'email' in feed.get('publisher_detail', []):
566 return self._validate_email(feed.publisher_detail.email)
567 if feed.get('errorreportsto', None):
568 return self._validate_email(feed.errorreportsto)
569 _LOG.debug('no sender address found, fallback to default')
570 return self.from_email
572 def _get_entry_email(self, parsed, entry):
573 """Get the best From email address ('John <jdoe@a.com>')
575 name = self._get_entry_name(parsed=parsed, entry=entry)
576 address = self._get_entry_address(parsed=parsed, entry=entry)
577 return _formataddr((name, address))
579 def _get_entry_tags(self, entry):
580 """Add post tags, if available
582 >>> f = Feed(name='test-feed')
583 >>> f._get_entry_tags({
584 ... 'tags': [{'term': 'tag1',
586 ... 'label': None}]})
588 >>> f._get_entry_tags({
589 ... 'tags': [{'term': 'tag1',
594 ... 'label': None}]})
597 Test some troublesome cases. No tags:
599 >>> f._get_entry_tags({})
603 >>> f._get_entry_tags({'tags': []})
605 Tags without a ``term`` entry:
607 >>> f._get_entry_tags({
608 ... 'tags': [{'scheme': None,
609 ... 'label': None}]})
611 Tags with an empty term:
613 >>> f._get_entry_tags({
614 ... 'tags': [{'term': '',
616 ... 'label': None}]})
618 taglist = [tag['term'] for tag in entry.get('tags', [])
619 if tag.get('term', '')]
621 return ','.join(taglist)
623 def _get_entry_content(self, entry):
624 """Select the best content from an entry.
626 Returns a feedparser content dict.
629 # * We have a bunch of potential contents.
630 # * We go thru looking for our first choice.
631 # (HTML or text, depending on self.html_mail)
632 # * If that doesn't work, we go thru looking for our second choice.
633 # * If that still doesn't work, we just take the first one.
635 # Possible future improvement:
636 # * Instead of just taking the first one
637 # pick the one in the "best" language.
638 # * HACK: hardcoded .html_mail, should take a tuple of media types
639 contents = list(entry.get('content', []))
640 if entry.get('summary_detail', None):
641 contents.append(entry.summary_detail)
643 types = ['text/html', 'text/plain']
645 types = ['text/plain', 'text/html']
646 for content_type in types:
647 for content in contents:
648 if content['type'] == content_type:
652 return {'type': 'text/plain', 'value': ''}
654 def _process_entry_content(self, entry, content, link, subject):
655 "Convert entry content to the requested format."
662 if self.use_css and self.css:
664 ' <style type="text/css">',
672 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
674 '<div id="body"><table><tr><td>',
676 if content['type'] in ('text/html', 'application/xhtml+xml'):
677 lines.append(content['value'].strip())
679 lines.append(_saxutils.escape(content['value'].strip()))
680 lines.append('</td></tr></table></div>')
682 '<div class="footer">'
683 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
685 for enclosure in getattr(entry, 'enclosures', []):
686 if getattr(enclosure, 'url', None):
688 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
690 if getattr(enclosure, 'src', None):
692 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
695 '<p><img src="{}" /></p>'.format(enclosure.src))
696 for elink in getattr(entry, 'links', []):
697 if elink.get('rel', None) == 'via':
700 'http://www.google.com/reader/public/atom/',
701 'http://www.google.com/reader/view/')
703 if elink.get('title', None):
704 title = elink['title']
705 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
713 content['type'] = 'text/html'
714 content['value'] = '\n'.join(lines)
716 else: # not self.html_mail
717 if content['type'] in ('text/html', 'application/xhtml+xml'):
718 lines = [_html2text.html2text(content['value'])]
720 lines = [content['value']]
722 lines.append('URL: {}'.format(link))
723 for enclosure in getattr(entry, 'enclosures', []):
724 if getattr(enclosure, 'url', None):
725 lines.append('Enclosure: {}'.format(enclosure.url))
726 if getattr(enclosure, 'src', None):
727 lines.append('Enclosure: {}'.format(enclosure.src))
728 for elink in getattr(entry, 'links', []):
729 if elink.get('rel', None) == 'via':
732 'http://www.google.com/reader/public/atom/',
733 'http://www.google.com/reader/view/')
735 if elink.get('title', None):
736 title = elink['title']
737 lines.append('Via: {} {}'.format(title, url))
738 content['type'] = 'text/plain'
739 content['value'] = '\n'.join(lines)
742 def _send(self, sender, message):
743 _LOG.info('send message for {}'.format(self))
744 section = self.section
745 if section not in self.config:
747 _email.send(sender=sender, recipient=self.to, message=message,
748 config=self.config, section=section)
750 def run(self, send=True):
751 """Fetch and process the feed, mailing entry emails.
754 ... name='test-feed',
755 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
756 >>> def send(sender, message):
757 ... print('send from {}:'.format(sender))
758 ... print(message.as_string())
759 >>> feed._send = send
760 >>> feed.to = 'jdoe@dummy.invalid'
761 >>> #parsed = feed.run() # enable for debugging
764 raise _error.NoToEmailAddress(feed=self)
765 parsed = self._fetch()
766 for (guid, id_, sender, message) in self._process(parsed):
767 _LOG.debug('new message: {}'.format(message['Subject']))
769 self._send(sender=sender, message=message)
770 self.seen[guid] = id_
771 self.etag = parsed.get('etag', None)
772 self.modified = parsed.get('modified', None)