1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.mime.message import MIMEMessage as _MIMEMessage
32 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
33 from email.utils import formataddr as _formataddr
34 import hashlib as _hashlib
35 import html.parser as _html_parser
37 import socket as _socket
39 import urllib.error as _urllib_error
40 import urllib.request as _urllib_request
42 import xml.sax as _sax
43 import xml.sax.saxutils as _saxutils
45 import feedparser as _feedparser
46 import html2text as _html2text
49 from . import __version__
50 from . import LOG as _LOG
51 from . import config as _config
52 from . import email as _email
53 from . import error as _error
54 from . import util as _util
# Identify ourselves to remote servers.
# NOTE(review): __url__ is used but its import is not visible in this view --
# presumably `from . import __url__` appears among the elided import lines.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
# Install a default opener so subsequent urllib requests share one opener.
_urllib_request.install_opener(_urllib_request.build_opener())
# Collect whichever socket error classes exist on this platform.
# NOTE(review): the `_SOCKET_ERRORS = []` initialization is not visible in
# this view -- confirm it appears just above in the full source.
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e # cleanup namespace
# Freeze into a tuple so it can be passed directly to isinstance().
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
# Presumably works around the error quoted below when feedparser uses its
# preferred XML parsers -- forcing the fallback parser instead.
# TypeError: 'str' does not support the buffer interface
_feedparser.PREFERRED_XML_PARSERS = []
73 """Utility class for feed manipulation and storage.
77 >>> from .config import CONFIG
80 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
82 test-feed (http://example.com/feed.atom -> a@b.com)
86 'user@rss2email.invalid'
88 >>> feed.from_email = 'a@b.com'
89 >>> feed.save_to_config()
90 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
92 from = user@rss2email.invalid
97 url = http://example.com/feed.atom
102 >>> feed.etag = 'dummy etag'
103 >>> string = pickle.dumps(feed)
104 >>> feed = pickle.loads(string)
105 >>> feed.load_from_config(config=CONFIG)
109 'http://example.com/feed.atom'
111 Names can only contain ASCII letters, digits, and '._-'. Here the
112 invalid space causes an exception:
114 >>> Feed(name='invalid name')
115 Traceback (most recent call last):
117 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
119 You must define a URL:
121 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
122 Traceback (most recent call last):
124 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
129 >>> CONFIG['DEFAULT']['to'] = ''
130 >>> test_section = CONFIG.pop('feed.test-feed')
    # NOTE(review): several list literals below are truncated in this view --
    # their elements and/or closing brackets are elided.  Comments are hedged
    # accordingly.
    # Feed names must be safe for config-section names and filenames.
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
    # saved/loaded from feed.dat using __getstate__/__setstate__.
    _dynamic_attributes = [
    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    _non_default_configured_attributes = [
    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
    ] = 'from_email' # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())
    # hints for value conversion
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',
    _integer_attributes = [
    _function_attributes = [
        'digest_post_process',
    def __init__(self, name=None, url=None, to=None, config=None):
        """Initialize the feed's name, dynamic state, and configuration.

        NOTE(review): some initialization lines are elided in this view
        (presumably establishing dynamic-attribute defaults and applying
        the `url`/`to` arguments) -- confirm against the full source.
        """
        self._set_name(name=name)
        # Seed dynamic state through the same path used when unpickling,
        # keeping both code paths consistent.
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)
        # NOTE(review): the two lines below are the bodies of __str__ and
        # __repr__ respectively; their `def` lines are elided in this view.
        return '{} ({} -> {})'.format(self.name, self.url, self.to)
        return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # NOTE(review): the `return dict(` line is elided in this view; the
        # line below is the generator expression that builds the state dict.
        (key,getattr(self,key)) for key in self._dynamic_attributes)
    get_state = __getstate__ # make it publicly accessible
223 def __setstate__(self, state):
224 "Restore dynamic attributes"
225 keys = sorted(state.keys())
226 if keys != sorted(self._dynamic_attributes):
227 raise ValueError(state)
228 self._set_name(name=state['name'])
229 self.__dict__.update(state)
231 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
                # Only keep options that have no DEFAULT or that differ
                # from DEFAULT, so the saved section stays minimal.
                if (attr in self._non_default_configured_attributes or
                    value != default[key]):
                    # NOTE(review): the `data[key] = value` assignment is
                    # elided in this view -- confirm.
        self.config[self.section] = data
    def load_from_config(self, config=None):
        "Restore configured attributes"
        # NOTE(review): this method is heavily elided in this view (loop
        # headers, `else:` branches, and the `data = dict(` wrapper are
        # missing); comments below are hedged reconstructions -- confirm.
        # presumably guarded by `if config is None:` (elided)
        config = _config.CONFIG
        if self.section in self.config:
            data = self.config[self.section]
            # presumably an `else:` branch (elided)
            data = self.config['DEFAULT']
        keys = sorted(data.keys())
        expected = sorted(self._configured_attribute_translations.values())
        # presumably iterating `expected` here (loop header elided)
            if (key not in keys and
                key not in self._non_default_configured_attributes):
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='missing configuration key: {}'.format(key))
        # presumably iterating `keys` here (loop header elided)
            if key not in expected:
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='extra configuration key: {}'.format(key))
        # presumably wrapped in `data = dict(` (elided): translate option
        # names back to attribute names and convert the raw string values.
            (self._configured_attribute_inverse_translations[k],
             self._get_configured_attribute_value(
                attribute=self._configured_attribute_inverse_translations[k],
            for k in data.keys())
        for attr in self._non_default_configured_attributes:
            # loop body elided in this view
        self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        """Convert an attribute value into its .config string form.

        NOTE(review): the first branch and the final fallback return are
        elided in this view.
        """
        elif attribute in self._list_attributes:
            # list -> comma-separated string
            return ', '.join(value)
        elif attribute in self._function_attributes:
            # presumably converts the function back into a dotted-name
            # string for storage -- confirm _util.import_name's contract
            return _util.import_name(value)
    def _get_configured_attribute_value(self, attribute, key, data):
        """Convert a raw .config value into the attribute's Python type.

        Uses the type-hint lists (_boolean_attributes, etc.) to choose the
        conversion.  NOTE(review): the plain-string fallback branch (and a
        line inside the function branch) are elided in this view.
        """
        if attribute in self._boolean_attributes:
            return data.getboolean(key)
        elif attribute in self._integer_attributes:
            return data.getint(key)
        elif attribute in self._list_attributes:
            # comma-separated string -> list of stripped strings
            return [x.strip() for x in data[key].split(',')]
        elif attribute in self._function_attributes:
            return _util.import_function(data[key])
        # NOTE(review): fragment of reset()'s docstring; its `def` line and
        # body are elided in this view.
        """Reset dynamic data
    def _set_name(self, name):
        # Validate the feed name before using it to build a config section.
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        # NOTE(review): the `self.name = name` assignment is elided in this
        # view, but self.name must clearly be set before the next line.
        self.section = 'feed.{}'.format(self.name)
        # NOTE(review): the `def _fetch(self):` line is elided in this view.
        """Fetch and parse a feed using feedparser.

        >>> feed = Feed(
        ...    name='test-feed',
        ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> parsed = feed._fetch()
        """
        _LOG.info('fetch {}'.format(self))
        # presumably guarded by `if not self.url:` (elided)
        raise _error.InvalidFeedConfig(setting='url', feed=self)
        if self.section in self.config:
            config = self.config[self.section]
            # presumably an `else:` branch (elided)
            config = self.config['DEFAULT']
        proxy = config['proxy']
        timeout = config.getint('feed-timeout')
        # presumably `kwargs = {}` and an `if proxy:` guard (elided)
        kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
        # Enforce the per-feed timeout around feedparser.parse, passing the
        # cache validators (etag/modified) for conditional GETs.
        f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
        return f(self.url, self.etag, modified=self.modified, **kwargs)
    def _process(self, parsed):
        """Yield (guid, id_, sender, message) for each processed entry.

        NOTE(review): a guard on `processed` and the tail of the
        post_process call are elided in this view.
        """
        _LOG.info('process {}'.format(self))
        self._check_for_errors(parsed)
        # Oldest entries first, so messages go out in published order.
        for entry in reversed(parsed.entries):
            _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
            processed = self._process_entry(parsed=parsed, entry=entry)
            # presumably guarded by `if processed:` (elided)
            guid,id_,sender,message = processed
            # Give the per-feed post-processing hook a chance to rewrite
            # (or drop) the message.
            if self.post_process:
                message = self.post_process(
                    feed=self, parsed=parsed, entry=entry, guid=guid,
                    # call truncated in this view (elided lines)
            yield (guid, id_, sender, message)
    def _check_for_errors(self, parsed):
        """Inspect a feedparser result for HTTP- and parse-level problems.

        Handles redirects, bad HTTP statuses, suspicious headers/content,
        and the various `bozo_exception` types.  NOTE(review): many guard,
        `raise`, and `return` lines are elided in this view, so the control
        flow shown here is incomplete.
        """
        status = getattr(parsed, 'status', 200)
        _LOG.debug('HTTP status {}'.format(status))
        # presumably the permanent-redirect branch (its guard is elided):
        _LOG.info('redirect {} from {} to {}'.format(
            self.name, self.url, parsed['url']))
        self.url = parsed['url']
        elif status not in [200, 302, 304]:
            raise _error.HTTPError(status=status, feed=self)
        http_headers = parsed.get('headers', {})
        # presumably an `if http_headers:` / `else:` pair (elided):
        _LOG.debug('HTTP headers: {}'.format(http_headers))
        _LOG.warning('could not get HTTP headers: {}'.format(self))
        # Heuristic warnings based on the response headers:
        if 'html' in http_headers.get('content-type', 'rss'):
            _LOG.warning('looks like HTML: {}'.format(self))
        if http_headers.get('content-length', '1') == '0':
            _LOG.warning('empty page: {}'.format(self))
        version = parsed.get('version', None)
        # presumably an `if version:` / `else:` pair (elided):
        _LOG.debug('feed version {}'.format(version))
        _LOG.warning('unrecognized version: {}'.format(self))
        # Classify feedparser's bozo_exception by type:
        exc = parsed.get('bozo_exception', None)
        if isinstance(exc, _socket.timeout):
            _LOG.error('timed out: {}'.format(self))
        elif isinstance(exc, OSError):
            _LOG.error('{}: {}'.format(exc, self))
        elif isinstance(exc, _SOCKET_ERRORS):
            _LOG.error('{}: {}'.format(exc, self))
        elif isinstance(exc, _feedparser.zlib.error):
            _LOG.error('broken compression: {}'.format(self))
        elif isinstance(exc, (IOError, AttributeError)):
            _LOG.error('{}: {}'.format(exc, self))
        elif isinstance(exc, KeyboardInterrupt):
            # body elided in this view (presumably `raise exc`)
        elif isinstance(exc, _sax.SAXParseException):
            _LOG.error('sax parsing error: {}: {}'.format(exc, self))
        elif (parsed.bozo and
            isinstance(exc, _feedparser.CharacterEncodingOverride)):
            # presumably a _LOG.warning( call head (elided):
            'incorrectly declared encoding: {}: {}'.format(exc, self))
        elif parsed.bozo or exc:
            # presumably guarded by `if exc is None:` (elided)
            exc = "can't process"
            _LOG.error('processing error: {}: {}'.format(exc, self))
        # presumably the head of a compound condition (elided):
            status in [200, 302] and
            not parsed.entries and
            # NOTE(review): `feed=feed` below looks like a NameError waiting
            # to happen -- no local `feed` is bound in this method; it
            # should presumably be `feed=self`.  Confirm and fix in the
            # full source.
            raise _error.ProcessingError(parsed=parsed, feed=feed)
428 def _html2text(self, html, baseurl=''):
429 self.config.setup_html2text(section=self.section)
430 return _html2text.html2text(html=html, baseurl=baseurl)
    def _process_entry(self, parsed, entry):
        """Build one outgoing message for *entry*.

        Returns (guid, id_, sender, message), or None when the entry was
        already seen.  NOTE(review): several lines are elided in this view
        (OrderedDict closing, empty-header cleanup, bonus-header error
        handling, the `try:` around content processing).
        """
        id_ = self._get_entry_id(entry)
        # If .trust_guid isn't set, we get back hashes of the content.
        # Instead of letting these run wild, we put them in context
        # by associating them with the actual ID (if it exists).
        guid = entry.get('id', id_)
        if isinstance(guid, dict):
            # NOTE(review): in Python 3, dict.values() returns a view that
            # does not support indexing -- this line would raise TypeError.
            # Presumably should be `next(iter(guid.values()))`; confirm.
            guid = guid.values()[0]
        if guid in self.seen:
            if self.seen[guid]['id'] == id_:
                _LOG.debug('already seen {}'.format(id_))
                return # already seen
        sender = self._get_entry_email(parsed=parsed, entry=entry)
        subject = self._get_entry_title(entry)
        extra_headers = _collections.OrderedDict((
            ('Date', self._get_entry_date(entry)),
            ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
            ('User-Agent', _USER_AGENT),
            ('X-RSS-Feed', self.url),
            ('X-RSS-URL', self._get_entry_link(entry)),
            ('X-RSS-TAGS', self._get_entry_tags(entry)),
        for k,v in extra_headers.items(): # remove empty tags, etc.
            # loop body elided in this view
        # User-supplied bonus headers, one "Key: value" per line.
        if self.bonus_header:
            for header in self.bonus_header.splitlines():
                # presumably wrapped in try/except for bad lines (elided)
                key,value = header.split(':', 1)
                extra_headers[key.strip()] = value.strip()
                # presumably part of a _LOG warning/error call (elided):
                'malformed bonus-header: {}'.format(
        content = self._get_entry_content(entry)
        # presumably wrapped in `try:` (elided)
        content = self._process_entry_content(
            entry=entry, content=content, subject=subject)
        except _error.ProcessingError as e:
            # handler body elided in this view
        message = _email.get_message(
            body=content['value'],
            content_type=content['type'].split('/', 1)[1],
            extra_headers=extra_headers,
            section=self.section)
        return (guid, id_, sender, message)
    def _get_entry_id(self, entry):
        """Get best ID from an entry."""
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # NOTE(review): dict.values() is not indexable in Python 3;
                # presumably should be `next(iter(entry.id.values()))` --
                # confirm.  (The plain `return entry.id` fallback is elided
                # in this view.)
                return entry.id.values()[0]
        # No usable id: hash the best available content/link/title instead.
        content = self._get_entry_content(entry)
        content_value = content['value'].strip()
        # presumably guarded by `if content_value:` (elided)
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'link', None):
            return _hashlib.sha1(
                entry.link.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'title', None):
            return _hashlib.sha1(
                entry.title.encode('unicode-escape')).hexdigest()
506 def _get_entry_link(self, entry):
507 return entry.get('link', None)
    def _get_entry_title(self, entry):
        """Return a single-line subject for the entry.

        Prefers the feed-supplied title (converting HTML to text); falls
        back to a snippet of the content.  NOTE(review): the `else:` around
        the fallback and the final `return` are elided in this view.
        """
        if hasattr(entry, 'title_detail') and entry.title_detail:
            title = entry.title_detail.value
            if 'html' in entry.title_detail.type:
                title = self._html2text(title)
        # presumably an `else:` fallback using the entry content (elided):
        content = self._get_entry_content(entry)
        value = content['value']
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            value = self._html2text(value)
        # Collapse to a single line suitable for a Subject header.
        title = title.replace('\n', ' ').strip()
    def _get_entry_date(self, entry):
        """Return an RFC 822 date string for the entry (UTC, '-0000').

        Tries the parsed date fields in `self.date_header_order`, falling
        back to the current time.  NOTE(review): a guard line and a `break`
        appear to be elided in this view.
        """
        datetime = _time.gmtime()
        for datetype in self.date_header_order:
            kind = datetype + '_parsed'
            if entry.get(kind, None):
                datetime = entry[kind]
        return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
    def _get_entry_name(self, parsed, entry):
        """Get the best display name for the entry's author.

        NOTE(review): the doctest below is truncated in this view, and
        several body lines (e.g. the `parts = []` / `feed = parsed.feed`
        setup and early return) are elided.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     ' <name>Example author</name>\\n'
        ...     ' <email>me@example.com</email>\\n'
        ...     ' <url>http://example.com/</url>\\n'
        >>> entry = parsed.entries[0]
        >>> f.friendly_name = False
        >>> f._get_entry_name(parsed, entry)
        >>> f.friendly_name = True
        >>> f._get_entry_name(parsed, entry)
        """
        if not self.friendly_name:
            # body elided in this view (presumably `return ''`)
            parts.append(feed.get('title', ''))
        for x in [entry, feed]:
            if 'name' in x.get('author_detail', []):
                if x.author_detail.name:
                    # presumably a separator append precedes this (elided)
                    parts.append(x.author_detail.name)
        if not ''.join(parts) and self.use_publisher_email:
            if 'name' in feed.get('publisher_detail', []):
                parts.append(feed.publisher_detail.name)
        return _html2text.unescape(''.join(parts))
    def _validate_email(self, email, default=None):
        """Do a basic quality check on email address

        Return `default` if the address doesn't appear to be
        well-formed. If `default` is `None`, return
        `self.from_email` instead.  NOTE(review): several expected-output
        lines of the doctest and part of the body are elided in this view.

        >>> f = Feed(name='test-feed')
        >>> f._validate_email('valid@example.com', 'default@example.com')
        >>> f._validate_email('invalid@', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('@invalid', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('invalid', 'default@example.com')
        'default@example.com'
        """
        # A well-formed address has exactly one '@' with non-empty sides.
        parts = email.split('@')
        if len(parts) != 2 or '' in parts:
            # presumably guarded by `if default is None:` (elided)
            return self.from_email
    def _get_entry_address(self, parsed, entry):
        """Get the best From email address ('<jdoe@a.com>')

        If the best guess isn't well-formed (something@something.com),
        use `self.from_email` instead.  NOTE(review): a short-circuit guard
        and the `feed = parsed.feed` binding are elided in this view.
        """
        # presumably a `force_from` short-circuit (its guard is elided):
        return self.from_email
        # presumably `feed = parsed.feed` here (elided)
        if 'email' in entry.get('author_detail', []):
            return self._validate_email(entry.author_detail.email)
        elif 'email' in feed.get('author_detail', []):
            return self._validate_email(feed.author_detail.email)
        if self.use_publisher_email:
            if 'email' in feed.get('publisher_detail', []):
                return self._validate_email(feed.publisher_detail.email)
            if feed.get('errorreportsto', None):
                return self._validate_email(feed.errorreportsto)
        _LOG.debug('no sender address found, fallback to default')
        return self.from_email
621 def _get_entry_email(self, parsed, entry):
622 """Get the best From email address ('John <jdoe@a.com>')
624 name = self._get_entry_name(parsed=parsed, entry=entry)
625 address = self._get_entry_address(parsed=parsed, entry=entry)
626 return _formataddr((name, address))
    def _get_entry_tags(self, entry):
        """Add post tags, if available

        NOTE(review): the doctests below are truncated in this view
        (several expected-output lines are elided), and the `if taglist:`
        guard before the final return appears to be elided too.

        >>> f = Feed(name='test-feed')
        >>> f._get_entry_tags({
        ...     'tags': [{'term': 'tag1',
        ...               'label': None}]})
        >>> f._get_entry_tags({
        ...     'tags': [{'term': 'tag1',
        ...               'label': None}]})

        Test some troublesome cases.  No tags:

        >>> f._get_entry_tags({})

        >>> f._get_entry_tags({'tags': []})

        Tags without a ``term`` entry:

        >>> f._get_entry_tags({
        ...     'tags': [{'scheme': None,
        ...               'label': None}]})

        Tags with an empty term:

        >>> f._get_entry_tags({
        ...     'tags': [{'term': '',
        ...               'label': None}]})
        """
        # Keep only tags that carry a non-empty 'term'.
        taglist = [tag['term'] for tag in entry.get('tags', [])
                   if tag.get('term', '')]
        # presumably guarded by `if taglist:` (elided)
        return ','.join(taglist)
    def _get_entry_content(self, entry):
        """Select the best content from an entry.

        Returns a feedparser content dict.
        """
        # * We have a bunch of potential contents.
        # * We go thru looking for our first choice.
        #   (HTML or text, depending on self.html_mail)
        # * If that doesn't work, we go thru looking for our second choice.
        # * If that still doesn't work, we just take the first one.
        #
        # Possible future improvement:
        # * Instead of just taking the first one
        #   pick the one in the "best" language.
        # * HACK: hardcoded .html_mail, should take a tuple of media types
        contents = list(entry.get('content', []))
        if entry.get('summary_detail', None):
            contents.append(entry.summary_detail)
        # NOTE(review): presumably an `if self.html_mail:` / `else:` pair
        # around the next two lines (elided in this view):
        types = ['text/html', 'text/plain']
        types = ['text/plain', 'text/html']
        for content_type in types:
            for content in contents:
                if content['type'] == content_type:
                    # presumably `return content` (elided)
        # Nothing matched at all: return an empty plain-text stub.
        return {'type': 'text/plain', 'value': ''}
    def _process_entry_content(self, entry, content, subject):
        "Convert entry content to the requested format."
        # NOTE(review): this method is heavily elided in this view (the
        # html_mail guard, `lines = [...]` constructions, CSS lines, and
        # several append-call heads are missing); comments are hedged.
        link = self._get_entry_link(entry)
        # HTML branch (guard elided, presumably `if self.html_mail:`):
        if self.use_css and self.css:
            # fragments of the inline <style> block (most lines elided):
            ' <style type="text/css">',
            '<h1 class="header"><a href="{}">{}</a></h1>'.format(
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            lines.append(content['value'].strip())
            # presumably an `else:` escaping plain text into HTML (elided)
            lines.append(_saxutils.escape(content['value'].strip()))
        lines.append('</div>')
            # footer fragments (enclosing append call elided):
            '<div class="footer">'
            '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
        for enclosure in getattr(entry, 'enclosures', []):
            if getattr(enclosure, 'url', None):
                '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
            if getattr(enclosure, 'src', None):
                '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
                '<p><img src="{}" /></p>'.format(enclosure.src))
        for elink in getattr(entry, 'links', []):
            if elink.get('rel', None) == 'via':
                title = elink.get('title', url)
                lines.append('<p>Via <a href="{}">{}</a></p>'.format(
        content['type'] = 'text/html'
        content['value'] = '\n'.join(lines)
        else: # not self.html_mail
            if content['type'] in ('text/html', 'application/xhtml+xml'):
                # presumably wrapped in `try:` (elided)
                lines = [self._html2text(content['value'])]
            except _html_parser.HTMLParseError as e:
                # NOTE(review): html.parser.HTMLParseError was removed in
                # Python 3.5; referencing it there raises AttributeError --
                # confirm the supported Python versions for this code.
                raise _error.ProcessingError(parsed=None, feed=self)
                lines = [content['value']]
            lines.append('URL: {}'.format(link))
            for enclosure in getattr(entry, 'enclosures', []):
                if getattr(enclosure, 'url', None):
                    lines.append('Enclosure: {}'.format(enclosure.url))
                if getattr(enclosure, 'src', None):
                    lines.append('Enclosure: {}'.format(enclosure.src))
            for elink in getattr(entry, 'links', []):
                if elink.get('rel', None) == 'via':
                    title = elink.get('title', url)
                    lines.append('Via: {} {}'.format(title, url))
            content['type'] = 'text/plain'
            content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Deliver *message* via the configured email transport.

        Falls back to the DEFAULT config section when this feed has no
        section of its own (NOTE(review): the fallback assignment under
        the `if` is elided in this view).
        """
        _LOG.info('send message for {}'.format(self))
        section = self.section
        if section not in self.config:
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
    def run(self, send=True):
        """Fetch and process the feed, mailing entry emails.

        >>> feed = Feed(
        ...    name='test-feed',
        ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> def send(sender, message):
        ...    print('send from {}:'.format(sender))
        ...    print(message.as_string())
        >>> feed._send = send
        >>> feed.to = 'jdoe@dummy.invalid'
        >>> #parsed = feed.run() # enable for debugging

        NOTE(review): several lines are elided in this view (guards,
        `seen = []` setup, per-message digest/send branching, and
        `self.seen[guid] = {}`); comments below are hedged.
        """
        # presumably guarded by `if not self.to:` (elided)
        raise _error.NoToEmailAddress(feed=self)
        parsed = self._fetch()
        # Digest mode: collect parts instead of sending one mail per entry.
        digest = self._new_digest()
        for (guid, id_, sender, message) in self._process(parsed):
            _LOG.debug('new message: {}'.format(message['Subject']))
            seen.append((guid, id_))
            self._append_to_digest(digest=digest, message=message)
            self._send(sender=sender, message=message)
            if guid not in self.seen:
                # presumably `self.seen[guid] = {}` (elided)
            self.seen[guid]['id'] = id_
        if self.digest and seen:
            # Let the digest post-processing hook rewrite the digest.
            if self.digest_post_process:
                digest = self.digest_post_process(
                    feed=self, parsed=parsed, seen=seen, message=digest)
            # presumably `self._send_digest(` call head (elided):
                digest=digest, seen=seen, sender=sender, send=send)
        # Remember cache validators for conditional GETs on the next run.
        self.etag = parsed.get('etag', None)
        self.modified = parsed.get('modified', None)
    def _new_digest(self):
        """Create an empty multipart/digest container for this feed.

        NOTE(review): the `return digest` line is elided in this view.
        """
        digest = _MIMEMultipart('digest')
        digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
        digest['Subject'] = 'digest for {}'.format(self.name)
        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
        digest['User-Agent'] = _USER_AGENT
        digest['X-RSS-Feed'] = self.url
    def _append_to_digest(self, digest, message):
        """Attach *message* to *digest* as a message/rfc822 attachment.

        NOTE(review): the `digest.attach(part)` line appears to be elided
        in this view.
        """
        part = _MIMEMessage(message)
        part.add_header('Content-Disposition', 'attachment')
    def _send_digest(self, digest, seen, sender, send=True):
        """Send a digest message

        The date is extracted from the last message in the digest
        payload. We assume that this part exists. If you don't have
        any messages in the digest, don't call this function.
        NOTE(review): a guard (`if send:`?) and the `self.seen[guid] = {}`
        initialization appear to be elided in this view.
        """
        digest['From'] = sender # TODO: _Header(), _formataddr()...
        # Reuse the newest attached message's Date for the digest itself.
        last_part = digest.get_payload()[-1]
        last_message = last_part.get_payload()[0]
        digest['Date'] = last_message['Date']
        _LOG.debug('new digest for {}'.format(self))
        self._send(sender=sender, message=digest)
        # Record every (guid, id_) pair now that the digest is on its way.
        for (guid, id_) in seen:
            if guid not in self.seen:
                # presumably `self.seen[guid] = {}` (elided)
            self.seen[guid]['id'] = id_