1 # Copyright (C) 2004-2013 Aaron Swartz
4 # Dennis Keitzel <github@pinshot.net>
6 # Etienne Millon <me@emillon.org>
7 # J. Lewis Muir <jlmuir@imca-cat.org>
9 # Lindsey Smith <lindsey.smith@gmail.com>
11 # Martin 'Joey' Schulze
13 # W. Trevor King <wking@tremily.us>
15 # This file is part of rss2email.
17 # rss2email is free software: you can redistribute it and/or modify it under
18 # the terms of the GNU General Public License as published by the Free Software
19 # Foundation, either version 2 of the License, or (at your option) version 3 of
22 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
23 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
24 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
26 # You should have received a copy of the GNU General Public License along with
27 # rss2email. If not, see <http://www.gnu.org/licenses/>.
29 """Define the ``Feed`` class for handling a single feed
32 import collections as _collections
33 from email.mime.message import MIMEMessage as _MIMEMessage
34 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
35 from email.utils import formataddr as _formataddr
36 import hashlib as _hashlib
37 import html.parser as _html_parser
39 import socket as _socket
41 import urllib.error as _urllib_error
42 import urllib.request as _urllib_request
44 import xml.sax as _sax
45 import xml.sax.saxutils as _saxutils
47 import feedparser as _feedparser
48 import html2text as _html2text
51 from . import __version__
52 from . import LOG as _LOG
53 from . import config as _config
54 from . import email as _email
55 from . import error as _error
56 from . import util as _util
# HTTP User-Agent string advertised to servers when fetching feeds.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect whichever socket exception classes exist on this platform so
# network failures can be caught uniformly while fetching feeds.
# BUG fixed: _SOCKET_ERRORS was appended to without ever being bound,
# which raises NameError at import time; initialize the list first.
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)

# Disable the sgmllib-based parsers, which with feedparser on Python 3
# fail with:
# TypeError: 'str' does not support the buffer interface
_feedparser.PREFERRED_XML_PARSERS = []
75 """Utility class for feed manipulation and storage.
79 >>> from .config import CONFIG
82 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
84 test-feed (http://example.com/feed.atom -> a@b.com)
88 'user@rss2email.invalid'
90 >>> feed.from_email = 'a@b.com'
91 >>> feed.save_to_config()
92 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
94 from = user@rss2email.invalid
99 url = http://example.com/feed.atom
104 >>> feed.etag = 'dummy etag'
105 >>> string = pickle.dumps(feed)
106 >>> feed = pickle.loads(string)
107 >>> feed.load_from_config(config=CONFIG)
111 'http://example.com/feed.atom'
113 Names can only contain ASCII letters, digits, and '._-'. Here the
114 invalid space causes an exception:
116 >>> Feed(name='invalid name')
117 Traceback (most recent call last):
119 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
121 You must define a URL:
123 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
124 Traceback (most recent call last):
126 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
131 >>> CONFIG['DEFAULT']['to'] = ''
132 >>> test_section = CONFIG.pop('feed.test-feed')
# Feed names must be filesystem- and config-safe: ASCII letters,
# digits, and '._-' only.
_name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
# Per-feed dynamic state (etag, modified, seen entries, ...)
# saved/loaded from feed.dat using __getstate__/__setstate__.
# NOTE(review): the list elements appear elided in this view.
_dynamic_attributes = [
## saved/loaded from ConfigParser instance
# attributes that aren't in DEFAULT
_non_default_configured_attributes = [
# attributes that are in DEFAULT
_default_configured_attributes = [
key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
_default_configured_attributes[
_default_configured_attributes.index('from')
] = 'from_email' # `from` is a Python keyword
# all attributes that are saved/loaded from .config
_configured_attributes = (
_non_default_configured_attributes + _default_configured_attributes)
# attribute name -> .config option
_configured_attribute_translations = dict(
(attr,attr) for attr in _non_default_configured_attributes)
_configured_attribute_translations.update(dict(
zip(_default_configured_attributes,
_config.CONFIG['DEFAULT'].keys())))
# .config option -> attribute name
_configured_attribute_inverse_translations = dict(
(v,k) for k,v in _configured_attribute_translations.items())
# hints for value conversion
_boolean_attributes = [
'use_publisher_email',
'links_after_each_paragraph',
_integer_attributes = [
_function_attributes = [
'digest_post_process',
def __init__(self, name=None, url=None, to=None, config=None):
    """Validate the feed name, seed dynamic state, and load config.

    NOTE(review): at least one statement between _set_name() and
    __setstate__() appears elided in this view (something must bind
    the attribute defaults that getattr() snapshots below); the
    `url` and `to` parameters are also not consumed by any visible
    line -- verify against the full source.
    """
    self._set_name(name=name)
    # Snapshot the current dynamic-attribute defaults through
    # __setstate__ so they pass its key validation.
    self.__setstate__(dict(
        (attr, getattr(self, attr))
        for attr in self._dynamic_attributes))
    self.load_from_config(config=config)
212 return '{} ({} -> {})'.format(self.name, self.url, self.to)
215 return '<Feed {}>'.format(str(self))
def __getstate__(self):
    "Save dynamic attributes"
    # NOTE(review): this line reads like the tail of a
    # `return dict(...)` expression whose opening line is missing
    # from this view -- verify before relying on it.
    (key,getattr(self,key)) for key in self._dynamic_attributes)

get_state = __getstate__  # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic attributes from *state*.

    *state* must contain exactly the keys named in
    ``_dynamic_attributes``; any mismatch raises ``ValueError`` so a
    stale or corrupt save file is rejected rather than half-applied.
    """
    expected = sorted(self._dynamic_attributes)
    if sorted(state) != expected:
        raise ValueError(state)
    # Re-validate the name (and derive the section) before adopting
    # the rest of the saved attributes wholesale.
    self._set_name(name=state['name'])
    self.__dict__.update(state)

set_state = __setstate__  # make it publicly accessible
def save_to_config(self):
    "Save configured attributes"
    data = _collections.OrderedDict()
    default = self.config['DEFAULT']
    for attr in self._configured_attributes:
        key = self._configured_attribute_translations[attr]
        value = getattr(self, attr)
        if value is not None:
            # Serialize the Python value into config-file text.
            value = self._get_configured_option_value(
                attribute=attr, value=value)
            # Only write options that differ from DEFAULT (or that
            # have no DEFAULT at all), keeping sections minimal.
            if (attr in self._non_default_configured_attributes or
                value != default[key]):
                # NOTE(review): nothing visible ever stores `value`
                # into `data` (expected `data[key] = value` here) --
                # the assignment appears elided from this view.
    self.config[self.section] = data
def load_from_config(self, config=None):
    "Restore configured attributes"
    # NOTE(review): several lines of this method are elided in this
    # view (the `if config is None:` guard, the `self.config = config`
    # assignment, the `else:` for the DEFAULT fallback, and the loop
    # headers around the validation below) -- fragments follow.
    config = _config.CONFIG
    if self.section in self.config:
        data = self.config[self.section]
    data = self.config['DEFAULT']
    keys = sorted(data.keys())
    expected = sorted(self._configured_attribute_translations.values())
    # A per-feed section may omit keys that have no DEFAULT; anything
    # else missing is a configuration error.
    if (key not in keys and
        key not in self._non_default_configured_attributes):
        raise _error.InvalidFeedConfig(
            setting=key, feed=self,
            message='missing configuration key: {}'.format(key))
    # Unknown options are rejected rather than silently ignored.
    if key not in expected:
        raise _error.InvalidFeedConfig(
            setting=key, feed=self,
            message='extra configuration key: {}'.format(key))
    # Translate each config option back into (attribute, value).
    (self._configured_attribute_inverse_translations[k],
    self._get_configured_attribute_value(
    attribute=self._configured_attribute_inverse_translations[k],
    for k in data.keys())
    for attr in self._non_default_configured_attributes:
    self.__dict__.update(data)
def _get_configured_option_value(self, attribute, value):
    """Serialize an attribute value into its .config string form."""
    # NOTE(review): the leading `if` branch of this chain is elided
    # from this view.
    elif attribute in self._list_attributes:
        return ', '.join(value)
    elif attribute in self._function_attributes:
        # NOTE(review): importing here looks inverted for the *save*
        # direction -- serializing a function to config text should
        # produce its dotted name, not import it.  Confirm against
        # _util and the load path (_get_configured_attribute_value).
        return _util.import_name(value)
def _get_configured_attribute_value(self, attribute, key, data):
    """Parse a .config option into the attribute's Python value.

    *data* is a ConfigParser section; *key* is the option name.
    """
    if attribute in self._boolean_attributes:
        return data.getboolean(key)
    elif attribute in self._integer_attributes:
        return data.getint(key)
    elif attribute in self._list_attributes:
        # Comma-separated list; surrounding whitespace is ignored.
        return [x.strip() for x in data[key].split(',')]
    elif attribute in self._function_attributes:
        # Dotted name resolved to a callable.
        return _util.import_function(data[key])
    # NOTE(review): the plain-string fallthrough (`return data[key]`)
    # appears elided from this view.
306 """Reset dynamic data
def _set_name(self, name):
    """Validate *name* and derive the config section ``feed.<name>``.

    Raises InvalidFeedName for anything outside ``[a-zA-Z0-9._-]``.
    """
    if not self._name_regexp.match(name):
        raise _error.InvalidFeedName(name=name, feed=self)
    # NOTE(review): `self.name = name` appears elided before this
    # line; self.name must be bound for the format below to work.
    self.section = 'feed.{}'.format(self.name)
319 """Fetch and parse a feed using feedparser.
322 ... name='test-feed',
323 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
324 >>> parsed = feed._fetch()
328 _LOG.info('fetch {}'.format(self))
330 raise _error.InvalidFeedConfig(setting='url', feed=self)
331 if self.section in self.config:
332 config = self.config[self.section]
334 config = self.config['DEFAULT']
335 proxy = config['proxy']
336 timeout = config.getint('feed-timeout')
339 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
340 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
341 return f(self.url, self.etag, modified=self.modified, **kwargs)
def _process(self, parsed):
    """Yield ``(guid, id, sender, message)`` for each new entry."""
    _LOG.info('process {}'.format(self))
    self._check_for_errors(parsed)
    # Feeds list newest-first; reverse so mail goes out oldest-first.
    for entry in reversed(parsed.entries):
        _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
        processed = self._process_entry(parsed=parsed, entry=entry)
        # NOTE(review): presumably guarded by `if processed:` (it is
        # None for already-seen entries) -- the guard is elided here.
        guid,id_,sender,message = processed
        if self.post_process:
            # User-configured hook; gets the full context and may
            # replace (or, presumably, veto) the message.
            message = self.post_process(
                feed=self, parsed=parsed, entry=entry, guid=guid,
        yield (guid, id_, sender, message)
def _check_for_errors(self, parsed):
    """Inspect a feedparser result for HTTP/parse problems.

    Logs warnings for recoverable oddities and raises for hard
    failures.  NOTE(review): many lines of this method (branch
    headers, early returns, a final multi-line condition) are elided
    from this view; fragments follow.
    """
    status = getattr(parsed, 'status', 200)
    _LOG.debug('HTTP status {}'.format(status))
    # Permanent redirect: remember the new feed location.
    _LOG.info('redirect {} from {} to {}'.format(
        self.name, self.url, parsed['url']))
    self.url = parsed['url']
    elif status not in [200, 302, 304]:
        raise _error.HTTPError(status=status, feed=self)
    http_headers = parsed.get('headers', {})
    _LOG.debug('HTTP headers: {}'.format(http_headers))
    _LOG.warning('could not get HTTP headers: {}'.format(self))
    # Heuristic sanity checks on the response body.
    if 'html' in http_headers.get('content-type', 'rss'):
        _LOG.warning('looks like HTML: {}'.format(self))
    if http_headers.get('content-length', '1') == '0':
        _LOG.warning('empty page: {}'.format(self))
    version = parsed.get('version', None)
    _LOG.debug('feed version {}'.format(version))
    _LOG.warning('unrecognized version: {}'.format(self))
    # Classify feedparser's "bozo" exception, most specific first.
    exc = parsed.get('bozo_exception', None)
    if isinstance(exc, _socket.timeout):
        _LOG.error('timed out: {}'.format(self))
    elif isinstance(exc, OSError):
        _LOG.error('{}: {}'.format(exc, self))
    elif isinstance(exc, _SOCKET_ERRORS):
        _LOG.error('{}: {}'.format(exc, self))
    elif isinstance(exc, _feedparser.zlib.error):
        _LOG.error('broken compression: {}'.format(self))
    elif isinstance(exc, (IOError, AttributeError)):
        _LOG.error('{}: {}'.format(exc, self))
    elif isinstance(exc, KeyboardInterrupt):
    elif isinstance(exc, _sax.SAXParseException):
        _LOG.error('sax parsing error: {}: {}'.format(exc, self))
    elif (parsed.bozo and
          isinstance(exc, _feedparser.CharacterEncodingOverride)):
        'incorrectly declared encoding: {}: {}'.format(exc, self))
    elif parsed.bozo or exc:
        exc = "can't process"
        _LOG.error('processing error: {}: {}'.format(exc, self))
        status in [200, 302] and
        not parsed.entries and
        # BUG(review): `feed` is not defined anywhere in this scope;
        # this should almost certainly be `feed=self`.
        raise _error.ProcessingError(parsed=parsed, feed=feed)
def _html2text(self, html, baseurl='', default=None):
    """Convert *html* to text, returning *default* on parse errors."""
    # Apply this feed's html2text settings before converting.
    self.config.setup_html2text(section=self.section)
    # NOTE(review): the `try:` before this call and the tail of the
    # except branch are elided from this view.  Also note that
    # html.parser.HTMLParseError was removed in Python 3.5, so this
    # except clause itself fails there -- confirm supported versions.
    return _html2text.html2text(html=html, baseurl=baseurl)
    except _html_parser.HTMLParseError as e:
        if default is not None:
def _process_entry(self, parsed, entry):
    """Build ``(guid, id, sender, message)`` for one entry.

    Returns None (implicitly) when the entry has already been seen.
    NOTE(review): several lines of this method are elided from this
    view (the extra_headers closing, the empty-header cleanup loop
    body, bonus-header error handling, and the try/except around
    content processing); fragments follow.
    """
    id_ = self._get_entry_id(entry)
    # If .trust_guid isn't set, we get back hashes of the content.
    # Instead of letting these run wild, we put them in context
    # by associating them with the actual ID (if it exists).
    guid = entry.get('id', id_)
    if isinstance(guid, dict):
        # BUG(review): dict views are not indexable in Python 3;
        # this needs next(iter(guid.values())) (or list(...)[0]).
        guid = guid.values()[0]
    if guid in self.seen:
        if self.seen[guid]['id'] == id_:
            _LOG.debug('already seen {}'.format(id_))
            return # already seen
    sender = self._get_entry_email(parsed=parsed, entry=entry)
    subject = self._get_entry_title(entry)
    extra_headers = _collections.OrderedDict((
        ('Date', self._get_entry_date(entry)),
        ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
        ('User-Agent', _USER_AGENT),
        ('X-RSS-Feed', self.url),
        ('X-RSS-URL', self._get_entry_link(entry)),
        ('X-RSS-TAGS', self._get_entry_tags(entry)),
    for k,v in extra_headers.items(): # remove empty tags, etc.
    if self.bonus_header:
        # One "Key: value" header per line.
        for header in self.bonus_header.splitlines():
            key,value = header.split(':', 1)
            extra_headers[key.strip()] = value.strip()
            'malformed bonus-header: {}'.format(
    content = self._get_entry_content(entry)
    content = self._process_entry_content(
        entry=entry, content=content, subject=subject)
    except _error.ProcessingError as e:
    message = _email.get_message(
        body=content['value'],
        content_type=content['type'].split('/', 1)[1],
        extra_headers=extra_headers,
        section=self.section)
    return (guid, id_, sender, message)
492 def _get_entry_id(self, entry):
493 """Get best ID from an entry."""
495 if getattr(entry, 'id', None):
496 # Newer versions of feedparser could return a dictionary
497 if isinstance(entry.id, dict):
498 return entry.id.values()[0]
500 content = self._get_entry_content(entry)
501 content_value = content['value'].strip()
503 return _hashlib.sha1(
504 content_value.encode('unicode-escape')).hexdigest()
505 elif getattr(entry, 'link', None):
506 return _hashlib.sha1(
507 entry.link.encode('unicode-escape')).hexdigest()
508 elif getattr(entry, 'title', None):
509 return _hashlib.sha1(
510 entry.title.encode('unicode-escape')).hexdigest()
512 def _get_entry_link(self, entry):
513 return entry.get('link', None)
def _get_entry_title(self, entry):
    """Return a single-line subject derived from title or content.

    NOTE(review): the `else:` fallback branch header and the final
    `return` appear elided from this view; fragments follow.
    """
    if hasattr(entry, 'title_detail') and entry.title_detail:
        title = entry.title_detail.value
        if 'html' in entry.title_detail.type:
            # Strip markup so the subject is plain text.
            title = self._html2text(title, default=title)
    content = self._get_entry_content(entry)
    value = content['value']
    if content['type'] in ('text/html', 'application/xhtml+xml'):
        value = self._html2text(value, default=value)
    # Collapse newlines so the value is safe in a mail header.
    title = title.replace('\n', ' ').strip()
def _get_entry_date(self, entry):
    """Return an RFC 2822 date string for the entry (UTC).

    Falls back to the current time when the entry carries no parsed
    date.  NOTE(review): a guard line around this loop (and the
    `break` after a match) appear elided from this view.
    """
    datetime = _time.gmtime()
    # Try the configured date fields in preference order; feedparser
    # exposes each as '<field>_parsed' (a struct_time).
    for datetype in self.date_header_order:
        kind = datetype + '_parsed'
        if entry.get(kind, None):
            datetime = entry[kind]
    return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
def _get_entry_name(self, parsed, entry):
    """Format the sender display name from ``name_format``.

    NOTE(review): the original docstring/doctest for this method is
    only partially visible; the retained fragments follow.

    >>> import feedparser
    >>> f = Feed(name='test-feed')
    >>> parsed = feedparser.parse(
    ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
    ...     '  <name>Example author</name>\\n'
    ...     '  <email>me@example.com</email>\\n'
    ...     '  <url>http://example.com/</url>\\n'
    >>> entry = parsed.entries[0]
    >>> f.name_format = ''
    >>> f._get_entry_name(parsed, entry)
    >>> f.name_format = '{author}'
    >>> f._get_entry_name(parsed, entry)
    >>> f.name_format = '{feed-title}: {author}'
    >>> f._get_entry_name(parsed, entry)
    >>> f.name_format = '{author} ({feed.name})'
    >>> f._get_entry_name(parsed, entry)
    'Example author (test-feed)'
    """
    if not self.name_format:
    # Placeholder values available to the format template.
    'feed-title': '<feed title>',
    'author': '<author>',
    'publisher': '<publisher>',
    # NOTE(review): `feed` here presumably comes from an elided
    # `feed = parsed.feed` assignment.
    data['feed-title'] = feed.get('title', '')
    # Prefer the entry's author details, falling back to the feed's.
    for x in [entry, feed]:
        if 'name' in x.get('author_detail', []):
            if x.author_detail.name:
                data['author'] = x.author_detail.name
    if 'name' in feed.get('publisher_detail', []):
        data['publisher'] = feed.publisher_detail.name
    name = self.name_format.format(**data)
    # Un-escape HTML entities via html2text's helper.
    return _html2text.unescape(name)
def _validate_email(self, email, default=None):
    """Do a basic quality check on email address

    Return `default` if the address doesn't appear to be
    well-formed.  If `default` is `None`, return
    ``self.from_email`` instead.

    >>> f = Feed(name='test-feed')
    >>> f._validate_email('valid@example.com', 'default@example.com')
    >>> f._validate_email('invalid@', 'default@example.com')
    'default@example.com'
    >>> f._validate_email('@invalid', 'default@example.com')
    'default@example.com'
    >>> f._validate_email('invalid', 'default@example.com')
    'default@example.com'
    """
    # Well-formed means exactly one '@' with non-empty local part
    # and domain.
    parts = email.split('@')
    if len(parts) != 2 or '' in parts:
        # NOTE(review): per the docstring, this branch should return
        # *default* when it is not None, and a valid address should
        # be returned unchanged -- those lines appear elided here.
        return self.from_email
def _get_entry_address(self, parsed, entry):
    """Get the best From email address ('<jdoe@a.com>')

    If the best guess isn't well-formed (something@somthing.com),
    use `self.from_email` instead.
    """
    # NOTE(review): the guard around this early return (presumably
    # `if self.force_from:`) and a `feed = parsed.feed` assignment
    # appear elided from this view.
    return self.from_email
    # Prefer the entry author's address, then the feed author's.
    if 'email' in entry.get('author_detail', []):
        return self._validate_email(entry.author_detail.email)
    elif 'email' in feed.get('author_detail', []):
        return self._validate_email(feed.author_detail.email)
    if self.use_publisher_email:
        # Optionally fall back to publisher / errors-to contacts.
        if 'email' in feed.get('publisher_detail', []):
            return self._validate_email(feed.publisher_detail.email)
        if feed.get('errorreportsto', None):
            return self._validate_email(feed.errorreportsto)
    _LOG.debug('no sender address found, fallback to default')
    return self.from_email
634 def _get_entry_email(self, parsed, entry):
635 """Get the best From email address ('John <jdoe@a.com>')
637 name = self._get_entry_name(parsed=parsed, entry=entry)
638 address = self._get_entry_address(parsed=parsed, entry=entry)
639 return _formataddr((name, address))
641 def _get_entry_tags(self, entry):
642 """Add post tags, if available
644 >>> f = Feed(name='test-feed')
645 >>> f._get_entry_tags({
646 ... 'tags': [{'term': 'tag1',
648 ... 'label': None}]})
650 >>> f._get_entry_tags({
651 ... 'tags': [{'term': 'tag1',
656 ... 'label': None}]})
659 Test some troublesome cases. No tags:
661 >>> f._get_entry_tags({})
665 >>> f._get_entry_tags({'tags': []})
667 Tags without a ``term`` entry:
669 >>> f._get_entry_tags({
670 ... 'tags': [{'scheme': None,
671 ... 'label': None}]})
673 Tags with an empty term:
675 >>> f._get_entry_tags({
676 ... 'tags': [{'term': '',
678 ... 'label': None}]})
680 taglist = [tag['term'] for tag in entry.get('tags', [])
681 if tag.get('term', '')]
683 return ','.join(taglist)
def _get_entry_content(self, entry):
    """Select the best content from an entry.

    Returns a feedparser content dict.
    """
    # * We have a bunch of potential contents.
    # * We go thru looking for our first choice.
    # (HTML or text, depending on self.html_mail)
    # * If that doesn't work, we go thru looking for our second choice.
    # * If that still doesn't work, we just take the first one.
    # Possible future improvement:
    # * Instead of just taking the first one
    # pick the one in the "best" language.
    # * HACK: hardcoded .html_mail, should take a tuple of media types
    contents = list(entry.get('content', []))
    if entry.get('summary_detail', None):
        contents.append(entry.summary_detail)
    # NOTE(review): the `if self.html_mail:` / `else:` around these
    # two assignments, and the `return content` inside the loop,
    # appear elided from this view.
    types = ['text/html', 'text/plain']
    types = ['text/plain', 'text/html']
    for content_type in types:
        for content in contents:
            if content['type'] == content_type:
    # Nothing matched at all: return an empty plain-text stub.
    return {'type': 'text/plain', 'value': ''}
def _process_entry_content(self, entry, content, subject):
    "Convert entry content to the requested format."
    # NOTE(review): many source lines of this method are missing from
    # this view; several statements below are fragments of larger
    # expressions (string elements of list literals, truncated
    # calls).  Comments mark the apparent structure.
    link = self._get_entry_link(entry)
    if self.use_css and self.css:
        # Inline stylesheet for HTML mail.
        ' <style type="text/css">',
    '<h1 class="header"><a href="{}">{}</a></h1>'.format(
    if content['type'] in ('text/html', 'application/xhtml+xml'):
        lines.append(content['value'].strip())
    # Plain text gets XML-escaped before embedding in the HTML body.
    lines.append(_saxutils.escape(content['value'].strip()))
    lines.append('</div>')
    '<div class="footer">'
    '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
    # Link out to any enclosures (and inline image enclosures).
    for enclosure in getattr(entry, 'enclosures', []):
        if getattr(enclosure, 'url', None):
            '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
        if getattr(enclosure, 'src', None):
            '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
            '<p><img src="{}" /></p>'.format(enclosure.src))
    for elink in getattr(entry, 'links', []):
        if elink.get('rel', None) == 'via':
            # NOTE(review): `url` comes from an elided assignment.
            title = elink.get('title', url)
            lines.append('<p>Via <a href="{}">{}</a></p>'.format(
    content['type'] = 'text/html'
    content['value'] = '\n'.join(lines)
    else: # not self.html_mail
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            lines = [self._html2text(content['value'])]
        # NOTE(review): html.parser.HTMLParseError was removed in
        # Python 3.5 -- confirm supported Python versions.
        except _html_parser.HTMLParseError as e:
            raise _error.ProcessingError(parsed=None, feed=self)
        lines = [content['value']]
        lines.append('URL: {}'.format(link))
        for enclosure in getattr(entry, 'enclosures', []):
            if getattr(enclosure, 'url', None):
                lines.append('Enclosure: {}'.format(enclosure.url))
            if getattr(enclosure, 'src', None):
                lines.append('Enclosure: {}'.format(enclosure.src))
        for elink in getattr(entry, 'links', []):
            if elink.get('rel', None) == 'via':
                title = elink.get('title', url)
                lines.append('Via: {} {}'.format(title, url))
        content['type'] = 'text/plain'
        content['value'] = '\n'.join(lines)
def _send(self, sender, message):
    """Deliver *message* using this feed's config section.

    Falls back to the DEFAULT section when the feed has no section
    of its own.
    """
    _LOG.info('send message for {}'.format(self))
    section = self.section
    if section not in self.config:
        # NOTE(review): expected `section = 'DEFAULT'` here -- the
        # line appears elided from this view.
    _email.send(sender=sender, recipient=self.to, message=message,
                config=self.config, section=section)
def run(self, send=True):
    """Fetch and process the feed, mailing entry emails.

    >>> # doctest partially elided in this view; fragments follow.
    ...     name='test-feed',
    ...     url='http://feeds.feedburner.com/allthingsrss/hJBr')
    >>> def send(sender, message):
    ...     print('send from {}:'.format(sender))
    ...     print(message.as_string())
    >>> feed._send = send
    >>> feed.to = 'jdoe@dummy.invalid'
    >>> #parsed = feed.run() # enable for debugging
    """
    # NOTE(review): the guard before this raise (presumably
    # `if not self.to:`), the `seen = []` initialization, the
    # digest/send branching, and the _send_digest call line are
    # elided from this view; fragments follow.
    raise _error.NoToEmailAddress(feed=self)
    parsed = self._fetch()
    digest = self._new_digest()
    for (guid, id_, sender, message) in self._process(parsed):
        _LOG.debug('new message: {}'.format(message['Subject']))
        seen.append((guid, id_))
        self._append_to_digest(digest=digest, message=message)
        self._send(sender=sender, message=message)
        if guid not in self.seen:
            self.seen[guid]['id'] = id_
    if self.digest and seen:
        if self.digest_post_process:
            # Optional user hook run on the assembled digest.
            digest = self.digest_post_process(
                feed=self, parsed=parsed, seen=seen, message=digest)
            digest=digest, seen=seen, sender=sender, send=send)
    # Cache validators so the next fetch can use a conditional GET.
    self.etag = parsed.get('etag', None)
    self.modified = parsed.get('modified', None)
def _new_digest(self):
    """Create an empty multipart/digest container for this feed."""
    digest = _MIMEMultipart('digest')
    digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
    digest['Subject'] = 'digest for {}'.format(self.name)
    # Random Message-ID in a reserved-invalid domain.
    digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
    digest['User-Agent'] = _USER_AGENT
    digest['X-RSS-Feed'] = self.url
    # NOTE(review): `return digest` appears elided from this view.
def _append_to_digest(self, digest, message):
    """Attach *message* to *digest* as a message/rfc822 part."""
    part = _MIMEMessage(message)
    part.add_header('Content-Disposition', 'attachment')
    # NOTE(review): `digest.attach(part)` appears elided from this
    # view; without it the part is never added.
def _send_digest(self, digest, seen, sender, send=True):
    """Send a digest message

    The date is extracted from the last message in the digest
    payload. We assume that this part exists. If you don't have
    any messages in the digest, don't call this function.
    """
    digest['From'] = sender # TODO: _Header(), _formataddr()...
    # The digest's Date mirrors its newest attached message.
    last_part = digest.get_payload()[-1]
    last_message = last_part.get_payload()[0]
    digest['Date'] = last_message['Date']
    _LOG.debug('new digest for {}'.format(self))
    # NOTE(review): an `if send:` guard before this call, and the
    # `self.seen[guid] = {}` initialization inside the loop below,
    # appear elided from this view.
    self._send(sender=sender, message=digest)
    for (guid, id_) in seen:
        if guid not in self.seen:
            self.seen[guid]['id'] = id_