1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.utils import formataddr as _formataddr
32 import hashlib as _hashlib
import re as _re
34 import socket as _socket
import time as _time
36 import urllib.error as _urllib_error
37 import urllib.request as _urllib_request
import uuid as _uuid
39 import xml.sax as _sax
40 import xml.sax.saxutils as _saxutils
42 import feedparser as _feedparser
43 import html2text as _html2text
46 from . import __version__
from . import __url__
47 from . import LOG as _LOG
48 from . import config as _config
49 from . import email as _email
50 from . import error as _error
51 from . import util as _util
# Identify ourselves to feed servers.
_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
# Install a default opener now so a ProxyHandler-based opener can be
# swapped in per-fetch later.
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket error classes available on this platform so network
# failures can be caught with a single `except _SOCKET_ERRORS:` clause.
# (The list must be initialized before the loop below appends to it.)
_SOCKET_ERRORS = []
for e in ['error', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
65 """Utility class for feed manipulation and storage.
69 >>> from .config import CONFIG
72 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
74 test-feed (http://example.com/feed.atom -> a@b.com)
78 'user@rss2email.invalid'
80 >>> feed.from_email = 'a@b.com'
81 >>> feed.save_to_config()
82 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
84 from = user@rss2email.invalid
89 url = http://example.com/feed.atom
94 >>> feed.etag = 'dummy etag'
95 >>> string = pickle.dumps(feed)
96 >>> feed = pickle.loads(string)
97 >>> feed.load_from_config(config=CONFIG)
101 'http://example.com/feed.atom'
103 Names can only contain ASCII letters, digits, and '._-'. Here the
104 invalid space causes an exception:
106 >>> Feed(name='invalid name')
107 Traceback (most recent call last):
109 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
111 You must define a URL:
113 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
114 Traceback (most recent call last):
116 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
121 >>> CONFIG['DEFAULT']['to'] = ''
122 >>> test_section = CONFIG.pop('feed.test-feed')
# NOTE(review): this listing has dropped lines (the embedded original line
# numbers are non-contiguous), so several list literals below are shown
# truncated.
# Feed names may only contain ASCII letters, digits, and '._-'.
124 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
126 # saved/loaded from feed.dat using __getstate__/__setstate__.
127 _dynamic_attributes = [
134 ## saved/loaded from ConfigParser instance
135 # attributes that aren't in DEFAULT
136 _non_default_configured_attributes = [
139 # attributes that are in DEFAULT
140 _default_configured_attributes = [
141 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
142 _default_configured_attributes[
143 _default_configured_attributes.index('from')
144 ] = 'from_email' # `from` is a Python keyword
145 # all attributes that are saved/loaded from .config
146 _configured_attributes = (
147 _non_default_configured_attributes + _default_configured_attributes)
148 # attribute name -> .config option
149 _configured_attribute_translations = dict(
150 (attr,attr) for attr in _non_default_configured_attributes)
151 _configured_attribute_translations.update(dict(
152 zip(_default_configured_attributes,
153 _config.CONFIG['DEFAULT'].keys())))
154 # .config option -> attribute name
155 _configured_attribute_inverse_translations = dict(
156 (v,k) for k,v in _configured_attribute_translations.items())
158 # hints for value conversion
159 _boolean_attributes = [
161 'use_publisher_email',
169 'links_after_each_paragraph',
174 _integer_attributes = [
184 def __init__(self, name=None, url=None, to=None, config=None):
# Validate the name, seed the dynamic attributes from their class-level
# defaults via __setstate__, then overlay any configured values.
185 self._set_name(name=name)
187 self.__setstate__(dict(
188 (attr, getattr(self, attr))
189 for attr in self._dynamic_attributes))
190 self.load_from_config(config=config)
# NOTE(review): the two returns below are the bodies of __str__ and
# __repr__ respectively; their `def` lines are missing from this listing.
197 return '{} ({} -> {})'.format(self.name, self.url, self.to)
200 return '<Feed {}>'.format(str(self))
# Pickle support: capture the per-feed dynamic state (the attributes
# stored in feed.dat rather than in the config file).
# NOTE(review): a line is missing between the docstring and the
# generator expression below (presumably `return dict(`).
202 def __getstate__(self):
203 "Save dynamic attributes"
205 (key,getattr(self,key)) for key in self._dynamic_attributes)
207 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    "Restore dynamic attributes"
    # Refuse state dicts that don't cover exactly the dynamic-attribute
    # set -- a mismatch means the state came from an incompatible version.
    if sorted(state.keys()) != sorted(self._dynamic_attributes):
        raise ValueError(state)
    self._set_name(name=state['name'])
    self.__dict__.update(state)

set_state = __setstate__  # make it publicly accessible
219 def save_to_config(self):
220 "Save configured attributes"
221 data = _collections.OrderedDict()
222 default = self.config['DEFAULT']
# Serialize each configured attribute, writing only values that differ
# from DEFAULT (or that have no DEFAULT entry at all) into this feed's
# own config section.
223 for attr in self._configured_attributes:
224 key = self._configured_attribute_translations[attr]
225 value = getattr(self, attr)
226 if value is not None:
227 value = self._get_configured_option_value(
228 attribute=attr, value=value)
229 if (attr in self._non_default_configured_attributes or
230 value != default[key]):
# NOTE(review): the line storing the value (presumably
# `data[key] = value`) is missing from this listing.
232 self.config[self.section] = data
234 def load_from_config(self, config=None):
235 "Restore configured attributes"
# Fall back to the global CONFIG, and to the DEFAULT section when this
# feed has no section of its own.
237 config = _config.CONFIG
239 if self.section in self.config:
240 data = self.config[self.section]
242 data = self.config['DEFAULT']
# Cross-check the section's keys against the expected option names in
# both directions before applying anything.
243 keys = sorted(data.keys())
244 expected = sorted(self._configured_attribute_translations.values())
247 if (key not in keys and
248 key not in self._non_default_configured_attributes):
249 raise _error.InvalidFeedConfig(
250 setting=key, feed=self,
251 message='missing configuration key: {}'.format(key))
253 if key not in expected:
254 raise _error.InvalidFeedConfig(
255 setting=key, feed=self,
256 message='extra configuration key: {}'.format(key))
# Translate option names back to attribute names, convert the values,
# then install them on the instance.
258 (self._configured_attribute_inverse_translations[k],
259 self._get_configured_attribute_value(
260 attribute=self._configured_attribute_inverse_translations[k],
262 for k in data.keys())
263 for attr in self._non_default_configured_attributes:
266 self.__dict__.update(data)
def _get_configured_option_value(self, attribute, value):
    """Serialize an attribute's value into a config-file string.

    List-valued attributes become comma-separated strings; everything
    else is rendered with str().  Without the final str() fallback this
    method returned None for non-list values, corrupting the saved
    config.
    """
    if value and attribute in self._list_attributes:
        return ', '.join(value)
    return str(value)
def _get_configured_attribute_value(self, attribute, key, data):
    """Deserialize a config-file entry into an attribute value.

    The class-level hint lists pick the conversion: boolean, integer,
    comma-separated list, or (fallback) the raw string.
    """
    if attribute in self._boolean_attributes:
        return data.getboolean(key)
    elif attribute in self._integer_attributes:
        return data.getint(key)
    elif attribute in self._list_attributes:
        return [x.strip() for x in data[key].split(',')]
    # Fallback: plain string attributes are returned as-is.  Without
    # this, the method fell off the end and returned None for them.
    return data[key]
283 """Reset dynamic data
def _set_name(self, name):
    """Validate and store the feed name and derive its config section.

    Raises InvalidFeedName for names containing anything other than
    ASCII letters, digits, or '._-'.
    """
    if not self._name_regexp.match(name):
        raise _error.InvalidFeedName(name=name, feed=self)
    # This assignment was missing: the line below reads self.name.
    self.name = name
    self.section = 'feed.{}'.format(self.name)
# _fetch (its `def` line is missing from this listing): fetch and parse
# the feed with feedparser, honoring per-feed proxy and timeout settings
# and passing the cached ETag/modified values for conditional GETs.
296 """Fetch and parse a feed using feedparser.
299 ... name='test-feed',
300 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
301 >>> parsed = feed._fetch()
305 _LOG.info('fetch {}'.format(self))
307 raise _error.InvalidFeedConfig(setting='url', feed=self)
# proxy/timeout come from the feed's own section when present, else DEFAULT.
308 if self.section in self.config:
309 config = self.config[self.section]
311 config = self.config['DEFAULT']
312 proxy = config['proxy']
313 timeout = config.getint('feed-timeout')
316 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
# TimeLimitedFunction aborts the fetch after `timeout` seconds.
317 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
318 return f(self.url, self.etag, modified=self.modified, **kwargs)
320 def _process(self, parsed):
# Walk entries oldest-first so emails go out in chronological order.
# NOTE(review): the lines yielding the processed result are missing from
# this listing (presumably `if processed: yield processed`).
321 _LOG.info('process {}'.format(self))
322 self._check_for_errors(parsed)
323 for entry in reversed(parsed.entries):
324 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
325 processed = self._process_entry(parsed=parsed, entry=entry)
329 def _check_for_errors(self, parsed):
# Inspect a feedparser result for HTTP-, network-, and parser-level
# problems, logging warnings and raising rss2email errors as needed.
331 status = getattr(parsed, 'status', 200)
332 _LOG.debug('HTTP status {}'.format(status))
# On a redirect, remember the new feed location.
334 _LOG.info('redirect {} from {} to {}'.format(
335 self.name, self.url, parsed['url']))
336 self.url = parsed['url']
337 elif status not in [200, 302, 304]:
338 raise _error.HTTPError(status=status, feed=self)
340 http_headers = parsed.get('headers', {})
342 _LOG.debug('HTTP headers: {}'.format(http_headers))
344 _LOG.warning('could not get HTTP headers: {}'.format(self))
# Heuristics for servers returning an HTML error page or an empty body.
347 if 'html' in http_headers.get('content-type', 'rss'):
348 _LOG.warning('looks like HTML: {}'.format(self))
350 if http_headers.get('content-length', '1') == '0':
351 _LOG.warning('empty page: {}'.format(self))
354 version = parsed.get('version', None)
356 _LOG.debug('feed version {}'.format(version))
358 _LOG.warning('unrecognized version: {}'.format(self))
# Classify feedparser's bozo_exception to give a precise error message.
361 exc = parsed.get('bozo_exception', None)
362 if isinstance(exc, _socket.timeout):
363 _LOG.error('timed out: {}'.format(self))
365 elif isinstance(exc, _SOCKET_ERRORS):
367 _LOG.error('{}: {}'.format(exc, self))
369 elif (hasattr(exc, 'reason') and
370 isinstance(exc.reason, _urllib_error.URLError)):
371 if isinstance(exc.reason, _SOCKET_ERRORS):
372 reason = exc.reason.args[1]
375 _LOG.error('{}: {}'.format(exc, self))
377 elif isinstance(exc, _feedparser.zlib.error):
378 _LOG.error('broken compression: {}'.format(self))
380 elif isinstance(exc, (IOError, AttributeError)):
381 _LOG.error('{}: {}'.format(exc, self))
383 elif isinstance(exc, KeyboardInterrupt):
385 elif isinstance(exc, _sax.SAXParseException):
386 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
388 elif parsed.bozo or exc:
390 exc = "can't process"
391 _LOG.error('processing error: {}: {}'.format(exc, self))
# NOTE(review): `feed` is not defined anywhere in this method -- the
# raise below would hit a NameError; presumably this should be
# `feed=self`.  Confirm against upstream before changing.
395 status in [200, 302] and
396 not parsed.entries and
398 raise _error.ProcessingError(parsed=parsed, feed=feed)
400 def _process_entry(self, parsed, entry):
# Build a (guid, id, sender, message) tuple for one feed entry, or
# return None when the entry has already been seen.
401 id_ = self._get_entry_id(entry)
402 # If .trust_guid isn't set, we get back hashes of the content.
403 # Instead of letting these run wild, we put them in context
404 # by associating them with the actual ID (if it exists).
405 guid = entry.get('id', id_)
406 if isinstance(guid, dict):
# NOTE(review): dict.values() views are not subscriptable in Python 3;
# this should be e.g. `list(guid.values())[0]`.
407 guid = guid.values()[0]
408 if guid in self.seen:
409 if self.seen[guid]['id'] == id_:
410 _LOG.debug('already seen {}'.format(id_))
411 return # already seen
412 sender = self._get_entry_email(parsed=parsed, entry=entry)
413 subject = self._get_entry_title(entry)
414 extra_headers = _collections.OrderedDict((
415 ('Date', self._get_entry_date(entry)),
416 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
417 ('User-Agent', 'rss2email'),
418 ('X-RSS-Feed', self.url),
420 ('X-RSS-URL', self._get_entry_link(entry)),
421 ('X-RSS-TAGS', self._get_entry_tags(entry)),
423 for k,v in extra_headers.items(): # remove empty tags, etc.
# `bonus_header` lets the user add literal "Key: value" header lines.
426 if self.bonus_header:
427 for header in self.bonus_header.splitlines():
429 key,value = header.split(':', 1)
430 extra_headers[key.strip()] = value.strip()
433 'malformed bonus-header: {}'.format(
# Assemble the final email message from the processed content.
436 content = self._get_entry_content(entry)
437 content = self._process_entry_content(
438 entry=entry, content=content, subject=subject)
439 message = _email.get_message(
443 body=content['value'],
444 content_type=content['type'].split('/', 1)[1],
445 extra_headers=extra_headers,
447 section=self.section)
448 return (guid, id_, sender, message)
def _get_entry_id(self, entry):
    """Get the best ID from an entry.

    With `trust_guid` set, use the feed-supplied ID when present;
    otherwise (or when the entry has no ID) fall back to a SHA-1 hash
    of the entry content, link, or title, in that order of preference.
    """
    if self.trust_guid:
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # Python 3 fix: dict .values() views are not
                # subscriptable, so materialize before indexing.
                return list(entry.id.values())[0]
            return entry.id
    content = self._get_entry_content(entry)
    content_value = content['value'].strip()
    if content_value:
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'link', None):
        return _hashlib.sha1(
            entry.link.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'title', None):
        return _hashlib.sha1(
            entry.title.encode('unicode-escape')).hexdigest()
def _get_entry_link(self, entry):
    """Return the entry's link URL, or None when the entry lacks one."""
    link = entry.get('link', None)
    return link
473 def _get_entry_title(self, entry):
# Prefer the entry's own title (converted from HTML when necessary);
# otherwise fall back to a snippet of the entry content.
# NOTE(review): the fallback branch header (`else:`) and the final
# `return title` appear to be missing from this listing.
474 if hasattr(entry, 'title_detail') and entry.title_detail:
475 title = entry.title_detail.value
476 if 'html' in entry.title_detail.type:
477 title = _html2text.html2text(title)
479 content = self._get_entry_content(entry)
480 value = content['value']
481 if content['type'] in ('text/html', 'application/xhtml+xml'):
482 value = _html2text.html2text(value)
# Header values must be single-line.
484 title = title.replace('\n', ' ').strip()
487 def _get_entry_date(self, entry):
# Pick the first available parsed date following `date_header_order`
# (e.g. published/updated), defaulting to the current UTC time, and
# format it as an RFC 2822 Date header value.
488 datetime = _time.gmtime()
490 for datetype in self.date_header_order:
491 kind = datetype + '_parsed'
492 if entry.get(kind, None):
493 datetime = entry[kind]
495 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
497 def _get_entry_name(self, parsed, entry):
# Build a human-friendly sender display name from the feed title and
# the entry/feed author details (the doctest below is truncated in
# this listing).
500 >>> import feedparser
501 >>> f = Feed(name='test-feed')
502 >>> parsed = feedparser.parse(
503 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
506 ... ' <name>Example author</name>\\n'
507 ... ' <email>me@example.com</email>\\n'
508 ... ' <url>http://example.com/</url>\\n'
513 >>> entry = parsed.entries[0]
514 >>> f.friendly_name = False
515 >>> f._get_entry_name(parsed, entry)
517 >>> f.friendly_name = True
518 >>> f._get_entry_name(parsed, entry)
521 if not self.friendly_name:
525 parts.append(feed.get('title', ''))
# Prefer the entry author's name, then the feed author's.
526 for x in [entry, feed]:
527 if 'name' in x.get('author_detail', []):
528 if x.author_detail.name:
531 parts.append(x.author_detail.name)
# Last resort: the publisher's name, when permitted by configuration.
533 if not ''.join(parts) and self.use_publisher_email:
534 if 'name' in feed.get('publisher_detail', []):
537 parts.append(feed.publisher_detail.name)
538 return _html2text.unescape(''.join(parts))
540 def _validate_email(self, email, default=None):
541 """Do a basic quality check on email address
543 Return `default` if the address doesn't appear to be
544 well-formed. If `default` is `None`, return
547 >>> f = Feed(name='test-feed')
548 >>> f._validate_email('valid@example.com', 'default@example.com')
550 >>> f._validate_email('invalid@', 'default@example.com')
551 'default@example.com'
552 >>> f._validate_email('@invalid', 'default@example.com')
553 'default@example.com'
554 >>> f._validate_email('invalid', 'default@example.com')
555 'default@example.com'
# A well-formed address has exactly one '@' with text on both sides.
# NOTE(review): the branch returning `default` (and the final
# `return email`) are missing from this listing; the visible
# `return self.from_email` is the default-is-None fallback.
557 parts = email.split('@')
558 if len(parts) != 2 or '' in parts:
560 return self.from_email
564 def _get_entry_address(self, parsed, entry):
565 """Get the best From email address ('<jdoe@a.com>')
567 If the best guess isn't well-formed (something@somthing.com),
568 use `self.from_email` instead.
# Preference order: entry author, feed author, then (only when
# use_publisher_email is set) publisher details or errorreportsto.
571 return self.from_email
573 if 'email' in entry.get('author_detail', []):
574 return self._validate_email(entry.author_detail.email)
575 elif 'email' in feed.get('author_detail', []):
576 return self._validate_email(feed.author_detail.email)
577 if self.use_publisher_email:
578 if 'email' in feed.get('publisher_detail', []):
579 return self._validate_email(feed.publisher_detail.email)
580 if feed.get('errorreportsto', None):
581 return self._validate_email(feed.errorreportsto)
582 _LOG.debug('no sender address found, fallback to default')
583 return self.from_email
def _get_entry_email(self, parsed, entry):
    """Get the best From email address ('John <jdoe@a.com>')."""
    # Combine the friendly display name and the bare address into an
    # RFC 5322 From value.
    return _formataddr((
        self._get_entry_name(parsed=parsed, entry=entry),
        self._get_entry_address(parsed=parsed, entry=entry),
    ))
592 def _get_entry_tags(self, entry):
593 """Add post tags, if available
595 >>> f = Feed(name='test-feed')
596 >>> f._get_entry_tags({
597 ... 'tags': [{'term': 'tag1',
599 ... 'label': None}]})
601 >>> f._get_entry_tags({
602 ... 'tags': [{'term': 'tag1',
607 ... 'label': None}]})
610 Test some troublesome cases. No tags:
612 >>> f._get_entry_tags({})
616 >>> f._get_entry_tags({'tags': []})
618 Tags without a ``term`` entry:
620 >>> f._get_entry_tags({
621 ... 'tags': [{'scheme': None,
622 ... 'label': None}]})
624 Tags with an empty term:
626 >>> f._get_entry_tags({
627 ... 'tags': [{'term': '',
629 ... 'label': None}]})
# Collect the non-empty tag terms and join them comma-separated.
631 taglist = [tag['term'] for tag in entry.get('tags', [])
632 if tag.get('term', '')]
634 return ','.join(taglist)
636 def _get_entry_content(self, entry):
637 """Select the best content from an entry.
639 Returns a feedparser content dict.
642 # * We have a bunch of potential contents.
643 # * We go thru looking for our first choice.
644 # (HTML or text, depending on self.html_mail)
645 # * If that doesn't work, we go thru looking for our second choice.
646 # * If that still doesn't work, we just take the first one.
648 # Possible future improvement:
649 # * Instead of just taking the first one
650 # pick the one in the "best" language.
651 # * HACK: hardcoded .html_mail, should take a tuple of media types
# Candidates are the entry's content list plus its summary, if any.
652 contents = list(entry.get('content', []))
653 if entry.get('summary_detail', None):
654 contents.append(entry.summary_detail)
# Media-type preference order depends on whether HTML mail is wanted.
656 types = ['text/html', 'text/plain']
658 types = ['text/plain', 'text/html']
659 for content_type in types:
660 for content in contents:
661 if content['type'] == content_type:
# Fallback when the entry supplies no content at all.
665 return {'type': 'text/plain', 'value': ''}
667 def _process_entry_content(self, entry, content, subject):
668 "Convert entry content to the requested format."
669 link = self._get_entry_link(entry)
# HTML branch: wrap the content in a header/body/footer page,
# optionally styled with the user's CSS.
676 if self.use_css and self.css:
678 ' <style type="text/css">',
686 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
688 '<div id="body"><table><tr><td>',
# HTML content is embedded as-is; plain text is escaped first.
690 if content['type'] in ('text/html', 'application/xhtml+xml'):
691 lines.append(content['value'].strip())
693 lines.append(_saxutils.escape(content['value'].strip()))
694 lines.append('</td></tr></table></div>')
696 '<div class="footer">'
697 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
# Append enclosure and "via" links to the footer.
699 for enclosure in getattr(entry, 'enclosures', []):
700 if getattr(enclosure, 'url', None):
702 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
704 if getattr(enclosure, 'src', None):
706 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
709 '<p><img src="{}" /></p>'.format(enclosure.src))
710 for elink in getattr(entry, 'links', []):
711 if elink.get('rel', None) == 'via':
714 if elink.get('title', None):
715 title = elink['title']
716 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
724 content['type'] = 'text/html'
725 content['value'] = '\n'.join(lines)
727 else: # not self.html_mail
# Plain-text branch: convert HTML via html2text when needed, then
# append URL, enclosure, and "via" lines.
728 if content['type'] in ('text/html', 'application/xhtml+xml'):
729 lines = [_html2text.html2text(content['value'])]
731 lines = [content['value']]
733 lines.append('URL: {}'.format(link))
734 for enclosure in getattr(entry, 'enclosures', []):
735 if getattr(enclosure, 'url', None):
736 lines.append('Enclosure: {}'.format(enclosure.url))
737 if getattr(enclosure, 'src', None):
738 lines.append('Enclosure: {}'.format(enclosure.src))
739 for elink in getattr(entry, 'links', []):
740 if elink.get('rel', None) == 'via':
743 if elink.get('title', None):
744 title = elink['title']
745 lines.append('Via: {} {}'.format(title, url))
746 content['type'] = 'text/plain'
747 content['value'] = '\n'.join(lines)
def _send(self, sender, message):
    """Send `message` to `self.to` via the rss2email email layer.

    Falls back to the DEFAULT config section when this feed has no
    section of its own (without the fallback assignment, the `if`
    below had an empty suite).
    """
    _LOG.info('send message for {}'.format(self))
    section = self.section
    if section not in self.config:
        section = 'DEFAULT'
    _email.send(sender=sender, recipient=self.to, message=message,
                config=self.config, section=section)
758 def run(self, send=True):
759 """Fetch and process the feed, mailing entry emails.
762 ... name='test-feed',
763 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
764 >>> def send(sender, message):
765 ... print('send from {}:'.format(sender))
766 ... print(message.as_string())
767 >>> feed._send = send
768 >>> feed.to = 'jdoe@dummy.invalid'
769 >>> #parsed = feed.run() # enable for debugging
# A `to` address is required before any entries are processed.
772 raise _error.NoToEmailAddress(feed=self)
773 parsed = self._fetch()
774 for (guid, id_, sender, message) in self._process(parsed):
775 _LOG.debug('new message: {}'.format(message['Subject']))
777 self._send(sender=sender, message=message)
# Record the entry as seen so it isn't mailed again, then cache the
# HTTP validators for the next conditional fetch.
778 if guid not in self.seen:
780 self.seen[guid]['id'] = id_
781 self.etag = parsed.get('etag', None)
782 self.modified = parsed.get('modified', None)