1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.utils import formataddr as _formataddr
32 import hashlib as _hashlib
34 import socket as _socket
36 import urllib.error as _urllib_error
37 import urllib.request as _urllib_request
39 import xml.sax as _sax
40 import xml.sax.saxutils as _saxutils
42 import feedparser as _feedparser
43 import html2text as _html2text
46 from . import __version__
47 from . import LOG as _LOG
48 from . import config as _config
49 from . import email as _email
50 from . import error as _error
51 from . import util as _util
54 _feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
55 _urllib_request.install_opener(_urllib_request.build_opener())
57 for e in ['error', 'gaierror']:
58 if hasattr(_socket, e):
59 _SOCKET_ERRORS.append(getattr(_socket, e))
60 del e # cleanup namespace
61 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
65 """Utility class for feed manipulation and storage.
69 >>> from .config import CONFIG
72 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
74 test-feed (http://example.com/feed.atom -> a@b.com)
78 'user@rss2email.invalid'
80 >>> feed.from_email = 'a@b.com'
81 >>> feed.save_to_config()
82 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
84 from = user@rss2email.invalid
89 url = http://example.com/feed.atom
94 >>> feed.etag = 'dummy etag'
95 >>> string = pickle.dumps(feed)
96 >>> feed = pickle.loads(string)
97 >>> feed.load_from_config(config=CONFIG)
101 'http://example.com/feed.atom'
103 Names can only contain ASCII letters, digits, and '._-'. Here the
104 invalid space causes an exception:
106 >>> Feed(name='invalid name')
107 Traceback (most recent call last):
109 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
111 You must define a URL:
113 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
114 Traceback (most recent call last):
116 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
121 >>> CONFIG['DEFAULT']['to'] = ''
122 >>> test_section = CONFIG.pop('feed.test-feed')
124 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
126 # saved/loaded from feed.dat using __getstate__/__setstate__.
127 _dynamic_attributes = [
134 ## saved/loaded from ConfigParser instance
135 # attributes that aren't in DEFAULT
136 _non_default_configured_attributes = [
139 # attributes that are in DEFAULT
140 _default_configured_attributes = [
141 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
142 _default_configured_attributes[
143 _default_configured_attributes.index('from')
144 ] = 'from_email' # `from` is a Python keyword
145 # all attributes that are saved/loaded from .config
146 _configured_attributes = (
147 _non_default_configured_attributes + _default_configured_attributes)
148 # attribute name -> .config option
149 _configured_attribute_translations = dict(
150 (attr,attr) for attr in _non_default_configured_attributes)
151 _configured_attribute_translations.update(dict(
152 zip(_default_configured_attributes,
153 _config.CONFIG['DEFAULT'].keys())))
154 # .config option -> attribute name
155 _configured_attribute_inverse_translations = dict(
156 (v,k) for k,v in _configured_attribute_translations.items())
158 # hints for value conversion
159 _boolean_attributes = [
161 'use_publisher_email',
169 'links_after_each_paragraph',
174 _integer_attributes = [
184 _function_attributes = [
188 def __init__(self, name=None, url=None, to=None, config=None):
189 self._set_name(name=name)
191 self.__setstate__(dict(
192 (attr, getattr(self, attr))
193 for attr in self._dynamic_attributes))
194 self.load_from_config(config=config)
201 return '{} ({} -> {})'.format(self.name, self.url, self.to)
204 return '<Feed {}>'.format(str(self))
206 def __getstate__(self):
207 "Save dyamic attributes"
209 (key,getattr(self,key)) for key in self._dynamic_attributes)
211 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic attributes from ``state``.

    ``state`` is a mapping whose keys must be exactly the names listed
    in ``self._dynamic_attributes``.  Any other key set is rejected
    with ``ValueError(state)``.  On success the name is passed through
    ``self._set_name`` and then every item of ``state`` is copied into
    the instance ``__dict__``.
    """
    found = sorted(state.keys())
    wanted = sorted(self._dynamic_attributes)
    if found == wanted:
        # Let _set_name see the name before the raw values are copied in.
        self._set_name(name=state['name'])
        self.__dict__.update(state)
        return
    raise ValueError(state)
221 set_state = __setstate__ # make it publicly accessible
223 def save_to_config(self):
224 "Save configured attributes"
225 data = _collections.OrderedDict()
226 default = self.config['DEFAULT']
227 for attr in self._configured_attributes:
228 key = self._configured_attribute_translations[attr]
229 value = getattr(self, attr)
230 if value is not None:
231 value = self._get_configured_option_value(
232 attribute=attr, value=value)
233 if (attr in self._non_default_configured_attributes or
234 value != default[key]):
236 self.config[self.section] = data
238 def load_from_config(self, config=None):
239 "Restore configured attributes"
241 config = _config.CONFIG
243 if self.section in self.config:
244 data = self.config[self.section]
246 data = self.config['DEFAULT']
247 keys = sorted(data.keys())
248 expected = sorted(self._configured_attribute_translations.values())
251 if (key not in keys and
252 key not in self._non_default_configured_attributes):
253 raise ValueError('missing key: {}'.format(key))
255 if key not in expected:
256 raise ValueError('extra key: {}'.format(key))
258 (self._configured_attribute_inverse_translations[k],
259 self._get_configured_attribute_value(
260 attribute=self._configured_attribute_inverse_translations[k],
262 for k in data.keys())
263 for attr in self._non_default_configured_attributes:
266 self.__dict__.update(data)
268 def _get_configured_option_value(self, attribute, value):
271 elif attribute in self._list_attributes:
272 return ', '.join(value)
273 elif attribute in self._function_attributes:
274 return _util.import_name(value)
277 def _get_configured_attribute_value(self, attribute, key, data):
278 if attribute in self._boolean_attributes:
279 return data.getboolean(key)
280 elif attribute in self._integer_attributes:
281 return data.getint(key)
282 elif attribute in self._list_attributes:
283 return [x.strip() for x in data[key].split(',')]
284 elif attribute in self._function_attributes:
286 return _util.import_function(data[key])
291 """Reset dynamic data
297 def _set_name(self, name):
298 if not self._name_regexp.match(name):
299 raise _error.InvalidFeedName(name=name, feed=self)
301 self.section = 'feed.{}'.format(self.name)
304 """Fetch and parse a feed using feedparser.
307 ... name='test-feed',
308 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
309 >>> parsed = feed._fetch()
313 _LOG.info('fetch {}'.format(self))
315 raise _error.InvalidFeedConfig(setting='url', feed=self)
316 if self.section in self.config:
317 config = self.config[self.section]
319 config = self.config['DEFAULT']
320 proxy = config['proxy']
321 timeout = config.getint('feed-timeout')
324 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
325 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
326 return f(self.url, self.etag, modified=self.modified, **kwargs)
328 def _process(self, parsed):
329 _LOG.info('process {}'.format(self))
330 self._check_for_errors(parsed)
331 for entry in reversed(parsed.entries):
332 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
333 processed = self._process_entry(parsed=parsed, entry=entry)
335 guid,id_,sender,message = processed
336 if self.post_process:
337 message = self.post_process(
338 feed=self, parsed=parsed, entry=entry, guid=guid,
342 yield (guid, id_, sender, message)
344 def _check_for_errors(self, parsed):
346 status = getattr(parsed, 'status', 200)
347 _LOG.debug('HTTP status {}'.format(status))
349 _LOG.info('redirect {} from {} to {}'.format(
350 self.name, self.url, parsed['url']))
351 self.url = parsed['url']
352 elif status not in [200, 302, 304]:
353 raise _error.HTTPError(status=status, feed=self)
355 http_headers = parsed.get('headers', {})
357 _LOG.debug('HTTP headers: {}'.format(http_headers))
359 _LOG.warning('could not get HTTP headers: {}'.format(self))
362 if 'html' in http_headers.get('content-type', 'rss'):
363 _LOG.warning('looks like HTML: {}'.format(self))
365 if http_headers.get('content-length', '1') == '0':
366 _LOG.warning('empty page: {}'.format(self))
369 version = parsed.get('version', None)
371 _LOG.debug('feed version {}'.format(version))
373 _LOG.warning('unrecognized version: {}'.format(self))
376 exc = parsed.get('bozo_exception', None)
377 if isinstance(exc, _socket.timeout):
378 _LOG.error('timed out: {}'.format(self))
380 elif isinstance(exc, _SOCKET_ERRORS):
382 _LOG.error('{}: {}'.format(exc, self))
384 elif (hasattr(exc, 'reason') and
385 isinstance(exc.reason, _urllib_error.URLError)):
386 if isinstance(exc.reason, _SOCKET_ERRORS):
387 reason = exc.reason.args[1]
390 _LOG.error('{}: {}'.format(exc, self))
392 elif isinstance(exc, _feedparser.zlib.error):
393 _LOG.error('broken compression: {}'.format(self))
395 elif isinstance(exc, (IOError, AttributeError)):
396 _LOG.error('{}: {}'.format(exc, self))
398 elif isinstance(exc, KeyboardInterrupt):
400 elif isinstance(exc, _sax.SAXParseException):
401 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
403 elif parsed.bozo or exc:
405 exc = "can't process"
406 _LOG.error('processing error: {}: {}'.format(exc, self))
410 status in [200, 302] and
411 not parsed.entries and
413 raise _error.ProcessingError(parsed=parsed, feed=feed)
415 def _process_entry(self, parsed, entry):
416 id_ = self._get_entry_id(entry)
417 # If .trust_guid isn't set, we get back hashes of the content.
418 # Instead of letting these run wild, we put them in context
419 # by associating them with the actual ID (if it exists).
420 guid = entry.get('id', id_)
421 if isinstance(guid, dict):
422 guid = guid.values()[0]
423 if guid in self.seen:
424 if self.seen[guid]['id'] == id_:
425 _LOG.debug('already seen {}'.format(id_))
426 return # already seen
427 sender = self._get_entry_email(parsed=parsed, entry=entry)
428 subject = self._get_entry_title(entry)
429 extra_headers = _collections.OrderedDict((
430 ('Date', self._get_entry_date(entry)),
431 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
432 ('User-Agent', 'rss2email'),
433 ('X-RSS-Feed', self.url),
435 ('X-RSS-URL', self._get_entry_link(entry)),
436 ('X-RSS-TAGS', self._get_entry_tags(entry)),
438 for k,v in extra_headers.items(): # remove empty tags, etc.
441 if self.bonus_header:
442 for header in self.bonus_header.splitlines():
444 key,value = header.split(':', 1)
445 extra_headers[key.strip()] = value.strip()
448 'malformed bonus-header: {}'.format(
451 content = self._get_entry_content(entry)
452 content = self._process_entry_content(
453 entry=entry, content=content, subject=subject)
454 message = _email.get_message(
458 body=content['value'],
459 content_type=content['type'].split('/', 1)[1],
460 extra_headers=extra_headers)
461 return (guid, id_, sender, message)
463 def _get_entry_id(self, entry):
464 """Get best ID from an entry."""
466 if getattr(entry, 'id', None):
467 # Newer versions of feedparser could return a dictionary
468 if isinstance(entry.id, dict):
469 return entry.id.values()[0]
471 content = self._get_entry_content(entry)
472 content_value = content['value'].strip()
474 return _hashlib.sha1(
475 content_value.encode('unicode-escape')).hexdigest()
476 elif getattr(entry, 'link', None):
477 return _hashlib.sha1(
478 entry.link.encode('unicode-escape')).hexdigest()
479 elif getattr(entry, 'title', None):
480 return _hashlib.sha1(
481 entry.title.encode('unicode-escape')).hexdigest()
def _get_entry_link(self, entry):
    """Return the ``link`` value of ``entry``, or ``None`` if it has none."""
    link = entry.get('link')
    return link
486 def _get_entry_title(self, entry):
487 if hasattr(entry, 'title_detail') and entry.title_detail:
488 title = entry.title_detail.value
489 if 'html' in entry.title_detail.type:
490 title = _html2text.html2text(title)
492 content = self._get_entry_content(entry)
493 value = content['value']
494 if content['type'] in ('text/html', 'application/xhtml+xml'):
495 value = _html2text.html2text(value)
497 title = title.replace('\n', ' ').strip()
500 def _get_entry_date(self, entry):
501 datetime = _time.gmtime()
503 for datetype in self.date_header_order:
504 kind = datetype + '_parsed'
505 if entry.get(kind, None):
506 datetime = entry[kind]
508 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
510 def _get_entry_name(self, parsed, entry):
513 >>> import feedparser
514 >>> f = Feed(name='test-feed')
515 >>> parsed = feedparser.parse(
516 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
519 ... ' <name>Example author</name>\\n'
520 ... ' <email>me@example.com</email>\\n'
521 ... ' <url>http://example.com/</url>\\n'
526 >>> entry = parsed.entries[0]
527 >>> f.friendly_name = False
528 >>> f._get_entry_name(parsed, entry)
530 >>> f.friendly_name = True
531 >>> f._get_entry_name(parsed, entry)
534 if not self.friendly_name:
538 parts.append(feed.get('title', ''))
539 for x in [entry, feed]:
540 if 'name' in x.get('author_detail', []):
541 if x.author_detail.name:
544 parts.append(x.author_detail.name)
546 if not ''.join(parts) and self.use_publisher_email:
547 if 'name' in feed.get('publisher_detail', []):
550 parts.append(feed.publisher_detail.name)
551 return _html2text.unescape(''.join(parts))
553 def _validate_email(self, email, default=None):
554 """Do a basic quality check on email address
556 Return `default` if the address doesn't appear to be
557 well-formed. If `default` is `None`, return
560 >>> f = Feed(name='test-feed')
561 >>> f._validate_email('valid@example.com', 'default@example.com')
563 >>> f._validate_email('invalid@', 'default@example.com')
564 'default@example.com'
565 >>> f._validate_email('@invalid', 'default@example.com')
566 'default@example.com'
567 >>> f._validate_email('invalid', 'default@example.com')
568 'default@example.com'
570 parts = email.split('@')
571 if len(parts) != 2 or '' in parts:
573 return self.from_email
577 def _get_entry_address(self, parsed, entry):
578 """Get the best From email address ('<jdoe@a.com>')
580 If the best guess isn't well-formed (something@somthing.com),
581 use `self.from_email` instead.
584 return self.from_email
586 if 'email' in entry.get('author_detail', []):
587 return self._validate_email(entry.author_detail.email)
588 elif 'email' in feed.get('author_detail', []):
589 return self._validate_email(feed.author_detail.email)
590 if self.use_publisher_email:
591 if 'email' in feed.get('publisher_detail', []):
592 return self._validate_email(feed.publisher_detail.email)
593 if feed.get('errorreportsto', None):
594 return self._validate_email(feed.errorreportsto)
595 _LOG.debug('no sender address found, fallback to default')
596 return self.from_email
def _get_entry_email(self, parsed, entry):
    """Get the best From email address ('John <jdoe@a.com>')

    The display name comes from ``self._get_entry_name`` and the
    address from ``self._get_entry_address``; the pair is then
    formatted with ``email.utils.formataddr``.
    """
    display_name = self._get_entry_name(parsed=parsed, entry=entry)
    mailbox = self._get_entry_address(parsed=parsed, entry=entry)
    pair = (display_name, mailbox)
    return _formataddr(pair)
605 def _get_entry_tags(self, entry):
606 """Add post tags, if available
608 >>> f = Feed(name='test-feed')
609 >>> f._get_entry_tags({
610 ... 'tags': [{'term': 'tag1',
612 ... 'label': None}]})
614 >>> f._get_entry_tags({
615 ... 'tags': [{'term': 'tag1',
620 ... 'label': None}]})
623 Test some troublesome cases. No tags:
625 >>> f._get_entry_tags({})
629 >>> f._get_entry_tags({'tags': []})
631 Tags without a ``term`` entry:
633 >>> f._get_entry_tags({
634 ... 'tags': [{'scheme': None,
635 ... 'label': None}]})
637 Tags with an empty term:
639 >>> f._get_entry_tags({
640 ... 'tags': [{'term': '',
642 ... 'label': None}]})
644 taglist = [tag['term'] for tag in entry.get('tags', [])
645 if tag.get('term', '')]
647 return ','.join(taglist)
649 def _get_entry_content(self, entry):
650 """Select the best content from an entry.
652 Returns a feedparser content dict.
655 # * We have a bunch of potential contents.
656 # * We go thru looking for our first choice.
657 # (HTML or text, depending on self.html_mail)
658 # * If that doesn't work, we go thru looking for our second choice.
659 # * If that still doesn't work, we just take the first one.
661 # Possible future improvement:
662 # * Instead of just taking the first one
663 # pick the one in the "best" language.
664 # * HACK: hardcoded .html_mail, should take a tuple of media types
665 contents = list(entry.get('content', []))
666 if entry.get('summary_detail', None):
667 contents.append(entry.summary_detail)
669 types = ['text/html', 'text/plain']
671 types = ['text/plain', 'text/html']
672 for content_type in types:
673 for content in contents:
674 if content['type'] == content_type:
678 return {'type': 'text/plain', 'value': ''}
680 def _process_entry_content(self, entry, content, subject):
681 "Convert entry content to the requested format."
682 link = self._get_entry_link(entry)
689 if self.use_css and self.css:
691 ' <style type="text/css">',
699 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
701 '<div id="body"><table><tr><td>',
703 if content['type'] in ('text/html', 'application/xhtml+xml'):
704 lines.append(content['value'].strip())
706 lines.append(_saxutils.escape(content['value'].strip()))
707 lines.append('</td></tr></table></div>')
709 '<div class="footer">'
710 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
712 for enclosure in getattr(entry, 'enclosures', []):
713 if getattr(enclosure, 'url', None):
715 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
717 if getattr(enclosure, 'src', None):
719 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
722 '<p><img src="{}" /></p>'.format(enclosure.src))
723 for elink in getattr(entry, 'links', []):
724 if elink.get('rel', None) == 'via':
727 'http://www.google.com/reader/public/atom/',
728 'http://www.google.com/reader/view/')
730 if elink.get('title', None):
731 title = elink['title']
732 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
740 content['type'] = 'text/html'
741 content['value'] = '\n'.join(lines)
743 else: # not self.html_mail
744 if content['type'] in ('text/html', 'application/xhtml+xml'):
745 lines = [_html2text.html2text(content['value'])]
747 lines = [content['value']]
749 lines.append('URL: {}'.format(link))
750 for enclosure in getattr(entry, 'enclosures', []):
751 if getattr(enclosure, 'url', None):
752 lines.append('Enclosure: {}'.format(enclosure.url))
753 if getattr(enclosure, 'src', None):
754 lines.append('Enclosure: {}'.format(enclosure.src))
755 for elink in getattr(entry, 'links', []):
756 if elink.get('rel', None) == 'via':
759 'http://www.google.com/reader/public/atom/',
760 'http://www.google.com/reader/view/')
762 if elink.get('title', None):
763 title = elink['title']
764 lines.append('Via: {} {}'.format(title, url))
765 content['type'] = 'text/plain'
766 content['value'] = '\n'.join(lines)
769 def _send(self, sender, message):
770 _LOG.info('send message for {}'.format(self))
771 section = self.section
772 if section not in self.config:
774 _email.send(sender=sender, recipient=self.to, message=message,
775 config=self.config, section=section)
777 def run(self, send=True):
778 """Fetch and process the feed, mailing entry emails.
781 ... name='test-feed',
782 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
783 >>> def send(sender, message):
784 ... print('send from {}:'.format(sender))
785 ... print(message.as_string())
786 >>> feed._send = send
787 >>> feed.to = 'jdoe@dummy.invalid'
788 >>> #parsed = feed.run() # enable for debugging
791 raise _error.NoToEmailAddress(feed=self)
792 parsed = self._fetch()
793 for (guid, id_, sender, message) in self._process(parsed):
794 _LOG.debug('new message: {}'.format(message['Subject']))
796 self._send(sender=sender, message=message)
797 if guid not in self.seen:
799 self.seen[guid]['id'] = id_
800 self.etag = parsed.get('etag', None)
801 self.modified = parsed.get('modified', None)