1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
import collections as _collections
from email.mime.message import MIMEMessage as _MIMEMessage
from email.mime.multipart import MIMEMultipart as _MIMEMultipart
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import html.parser as _html_parser
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
# Identify ourselves to servers; feedparser and urllib both send this string.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket exception classes available on this platform so feed
# fetching can catch them as a single tuple.  BUG FIX: the list must be
# initialized before the loop appends to it (previously the name was used
# before assignment, raising NameError at import time).
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
69 """Utility class for feed manipulation and storage.
73 >>> from .config import CONFIG
76 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
78 test-feed (http://example.com/feed.atom -> a@b.com)
82 'user@rss2email.invalid'
84 >>> feed.from_email = 'a@b.com'
85 >>> feed.save_to_config()
86 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
88 from = user@rss2email.invalid
93 url = http://example.com/feed.atom
98 >>> feed.etag = 'dummy etag'
99 >>> string = pickle.dumps(feed)
100 >>> feed = pickle.loads(string)
101 >>> feed.load_from_config(config=CONFIG)
105 'http://example.com/feed.atom'
107 Names can only contain ASCII letters, digits, and '._-'. Here the
108 invalid space causes an exception:
110 >>> Feed(name='invalid name')
111 Traceback (most recent call last):
113 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
115 You must define a URL:
117 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
118 Traceback (most recent call last):
120 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
125 >>> CONFIG['DEFAULT']['to'] = ''
126 >>> test_section = CONFIG.pop('feed.test-feed')
    # Feed names must be safe for config-section and filesystem use:
    # ASCII letters, digits, and '._-' only.
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
    # saved/loaded from feed.dat using __getstate__/__setstate__.
    _dynamic_attributes = [
    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    _non_default_configured_attributes = [
    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email' # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())
    # hints for value conversion (see _get_configured_attribute_value)
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',
    _integer_attributes = [
    _function_attributes = [
        'digest_post_process',
    def __init__(self, name=None, url=None, to=None, config=None):
        """Initialize a feed: validate the name, seed dynamic state,
        and load configured attributes from `config`."""
        self._set_name(name=name)
        # Route initial dynamic state through __setstate__ so that
        # construction and unpickling share one code path.
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)

        # __str__ body: "name (url -> to)"
        return '{} ({} -> {})'.format(self.name, self.url, self.to)

        # __repr__ body
        return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # Pickle support: capture only the runtime (dynamic) attributes.
            (key,getattr(self,key)) for key in self._dynamic_attributes)

    get_state = __getstate__ # make it publicly accessible
219 def __setstate__(self, state):
220 "Restore dynamic attributes"
221 keys = sorted(state.keys())
222 if keys != sorted(self._dynamic_attributes):
223 raise ValueError(state)
224 self._set_name(name=state['name'])
225 self.__dict__.update(state)
227 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            # Translate attribute name -> config option name
            # (e.g. 'from_email' -> 'from').
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
                # Only store options that differ from DEFAULT (or that have
                # no DEFAULT entry at all).
                if (attr in self._non_default_configured_attributes or
                    value != default[key]):
        self.config[self.section] = data
    def load_from_config(self, config=None):
        "Restore configured attributes"
            # Fall back to the global configuration.
            config = _config.CONFIG
        if self.section in self.config:
            data = self.config[self.section]
            data = self.config['DEFAULT']
        keys = sorted(data.keys())
        expected = sorted(self._configured_attribute_translations.values())
            # Validate in both directions: every expected option must be
            # present, and every present option must be expected.
            if (key not in keys and
                key not in self._non_default_configured_attributes):
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='missing configuration key: {}'.format(key))
            if key not in expected:
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='extra configuration key: {}'.format(key))
            # Convert option strings back to typed attribute values.
            (self._configured_attribute_inverse_translations[k],
             self._get_configured_attribute_value(
                 attribute=self._configured_attribute_inverse_translations[k],
            for k in data.keys())
        for attr in self._non_default_configured_attributes:
        self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        # Convert a Python attribute value into its config-file string form.
        elif attribute in self._list_attributes:
            # lists are stored as comma-separated strings
            return ', '.join(value)
        elif attribute in self._function_attributes:
            # NOTE(review): confirm _util.import_name(value) returns the
            # dotted name for a function object (the inverse of
            # _util.import_function used when loading).
            return _util.import_name(value)
    def _get_configured_attribute_value(self, attribute, key, data):
        # Convert the config string at data[key] to a typed Python value,
        # using the attribute-kind hint lists defined on the class.
        if attribute in self._boolean_attributes:
            return data.getboolean(key)
        elif attribute in self._integer_attributes:
            return data.getint(key)
        elif attribute in self._list_attributes:
            # comma-separated list, whitespace-tolerant
            return [x.strip() for x in data[key].split(',')]
        elif attribute in self._function_attributes:
            # dotted name -> callable
            return _util.import_function(data[key])
        """Reset dynamic data

    def _set_name(self, name):
        # Enforce the documented name charset (ASCII letters, digits, '._-');
        # the name is also used to build the config section name below.
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        self.section = 'feed.{}'.format(self.name)
        """Fetch and parse a feed using feedparser.

        ...     name='test-feed',
        ...     url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> parsed = feed._fetch()
        _LOG.info('fetch {}'.format(self))
            raise _error.InvalidFeedConfig(setting='url', feed=self)
        if self.section in self.config:
            config = self.config[self.section]
            config = self.config['DEFAULT']
        proxy = config['proxy']
        timeout = config.getint('feed-timeout')
            # route HTTP fetches through the configured proxy
            kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
        # Bound the fetch by the configured feed-timeout; pass etag/modified
        # so the server can answer with 304 Not Modified.
        f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
        return f(self.url, self.etag, modified=self.modified, **kwargs)
    def _process(self, parsed):
        # Generator yielding (guid, id_, sender, message) for new entries.
        _LOG.info('process {}'.format(self))
        self._check_for_errors(parsed)
        # Oldest entries first, so mails are sent in chronological order.
        for entry in reversed(parsed.entries):
            _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
            processed = self._process_entry(parsed=parsed, entry=entry)
                guid,id_,sender,message = processed
                if self.post_process:
                    # Per-feed hook may rewrite (or veto) the message.
                    message = self.post_process(
                        feed=self, parsed=parsed, entry=entry, guid=guid,
                yield (guid, id_, sender, message)
354 def _check_for_errors(self, parsed):
356 status = getattr(parsed, 'status', 200)
357 _LOG.debug('HTTP status {}'.format(status))
359 _LOG.info('redirect {} from {} to {}'.format(
360 self.name, self.url, parsed['url']))
361 self.url = parsed['url']
362 elif status not in [200, 302, 304]:
363 raise _error.HTTPError(status=status, feed=self)
365 http_headers = parsed.get('headers', {})
367 _LOG.debug('HTTP headers: {}'.format(http_headers))
369 _LOG.warning('could not get HTTP headers: {}'.format(self))
372 if 'html' in http_headers.get('content-type', 'rss'):
373 _LOG.warning('looks like HTML: {}'.format(self))
375 if http_headers.get('content-length', '1') == '0':
376 _LOG.warning('empty page: {}'.format(self))
379 version = parsed.get('version', None)
381 _LOG.debug('feed version {}'.format(version))
383 _LOG.warning('unrecognized version: {}'.format(self))
386 exc = parsed.get('bozo_exception', None)
387 if isinstance(exc, _socket.timeout):
388 _LOG.error('timed out: {}'.format(self))
390 elif isinstance(exc, OSError):
391 _LOG.error('{}: {}'.format(exc, self))
393 elif isinstance(exc, _SOCKET_ERRORS):
394 _LOG.error('{}: {}'.format(exc, self))
396 elif isinstance(exc, _feedparser.zlib.error):
397 _LOG.error('broken compression: {}'.format(self))
399 elif isinstance(exc, (IOError, AttributeError)):
400 _LOG.error('{}: {}'.format(exc, self))
402 elif isinstance(exc, KeyboardInterrupt):
404 elif isinstance(exc, _sax.SAXParseException):
405 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
407 elif parsed.bozo or exc:
409 exc = "can't process"
410 _LOG.error('processing error: {}: {}'.format(exc, self))
414 status in [200, 302] and
415 not parsed.entries and
417 raise _error.ProcessingError(parsed=parsed, feed=feed)
419 def _html2text(self, html, baseurl=''):
420 self.config.setup_html2text(section=self.section)
421 return _html2text.html2text(html=html, baseurl=baseurl)
423 def _process_entry(self, parsed, entry):
424 id_ = self._get_entry_id(entry)
425 # If .trust_guid isn't set, we get back hashes of the content.
426 # Instead of letting these run wild, we put them in context
427 # by associating them with the actual ID (if it exists).
428 guid = entry.get('id', id_)
429 if isinstance(guid, dict):
430 guid = guid.values()[0]
431 if guid in self.seen:
432 if self.seen[guid]['id'] == id_:
433 _LOG.debug('already seen {}'.format(id_))
434 return # already seen
435 sender = self._get_entry_email(parsed=parsed, entry=entry)
436 subject = self._get_entry_title(entry)
437 extra_headers = _collections.OrderedDict((
438 ('Date', self._get_entry_date(entry)),
439 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
440 ('User-Agent', _USER_AGENT),
441 ('X-RSS-Feed', self.url),
443 ('X-RSS-URL', self._get_entry_link(entry)),
444 ('X-RSS-TAGS', self._get_entry_tags(entry)),
446 for k,v in extra_headers.items(): # remove empty tags, etc.
449 if self.bonus_header:
450 for header in self.bonus_header.splitlines():
452 key,value = header.split(':', 1)
453 extra_headers[key.strip()] = value.strip()
456 'malformed bonus-header: {}'.format(
459 content = self._get_entry_content(entry)
461 content = self._process_entry_content(
462 entry=entry, content=content, subject=subject)
463 except _error.ProcessingError as e:
466 message = _email.get_message(
470 body=content['value'],
471 content_type=content['type'].split('/', 1)[1],
472 extra_headers=extra_headers,
474 section=self.section)
475 return (guid, id_, sender, message)
477 def _get_entry_id(self, entry):
478 """Get best ID from an entry."""
480 if getattr(entry, 'id', None):
481 # Newer versions of feedparser could return a dictionary
482 if isinstance(entry.id, dict):
483 return entry.id.values()[0]
485 content = self._get_entry_content(entry)
486 content_value = content['value'].strip()
488 return _hashlib.sha1(
489 content_value.encode('unicode-escape')).hexdigest()
490 elif getattr(entry, 'link', None):
491 return _hashlib.sha1(
492 entry.link.encode('unicode-escape')).hexdigest()
493 elif getattr(entry, 'title', None):
494 return _hashlib.sha1(
495 entry.title.encode('unicode-escape')).hexdigest()
497 def _get_entry_link(self, entry):
498 return entry.get('link', None)
    def _get_entry_title(self, entry):
        # Prefer the entry's explicit title; otherwise derive one from the
        # entry content below.
        if hasattr(entry, 'title_detail') and entry.title_detail:
            title = entry.title_detail.value
            if 'html' in entry.title_detail.type:
                # strip markup from HTML titles
                title = self._html2text(title)
            content = self._get_entry_content(entry)
            value = content['value']
            if content['type'] in ('text/html', 'application/xhtml+xml'):
                value = self._html2text(value)
        # Subject headers must be single-line.
        title = title.replace('\n', ' ').strip()
    def _get_entry_date(self, entry):
        # Default to the current time when the entry carries no usable date.
        datetime = _time.gmtime()
            # date_header_order lists date fields by preference; use the
            # first one feedparser managed to parse ('<kind>_parsed').
            for datetype in self.date_header_order:
                kind = datetype + '_parsed'
                if entry.get(kind, None):
                    datetime = entry[kind]
        # RFC 2822 date string with an explicit UTC offset.
        return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
    def _get_entry_name(self, parsed, entry):
        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '  <name>Example author</name>\\n'
        ...     '  <email>me@example.com</email>\\n'
        ...     '  <url>http://example.com/</url>\\n'
        >>> entry = parsed.entries[0]
        >>> f.friendly_name = False
        >>> f._get_entry_name(parsed, entry)
        >>> f.friendly_name = True
        >>> f._get_entry_name(parsed, entry)
        # Prefer author names from the entry, then the feed; optionally fall
        # back to the publisher when use_publisher_email is set.
        if not self.friendly_name:
            parts.append(feed.get('title', ''))
        for x in [entry, feed]:
            if 'name' in x.get('author_detail', []):
                if x.author_detail.name:
                    parts.append(x.author_detail.name)
        if not ''.join(parts) and self.use_publisher_email:
            if 'name' in feed.get('publisher_detail', []):
                parts.append(feed.publisher_detail.name)
        # Undo any HTML entity escaping in the collected name.
        return _html2text.unescape(''.join(parts))
    def _validate_email(self, email, default=None):
        """Do a basic quality check on email address

        Return `default` if the address doesn't appear to be
        well-formed.  If `default` is `None`, return

        >>> f = Feed(name='test-feed')
        >>> f._validate_email('valid@example.com', 'default@example.com')
        >>> f._validate_email('invalid@', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('@invalid', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('invalid', 'default@example.com')
        'default@example.com'
        # Well-formed means exactly one '@' with a nonempty local part and
        # a nonempty domain.
        parts = email.split('@')
        if len(parts) != 2 or '' in parts:
                return self.from_email
    def _get_entry_address(self, parsed, entry):
        """Get the best From email address ('<jdoe@a.com>')

        If the best guess isn't well-formed (something@something.com),
        use `self.from_email` instead.
            return self.from_email
        # Prefer the entry's author address, then the feed's.
        if 'email' in entry.get('author_detail', []):
            return self._validate_email(entry.author_detail.email)
        elif 'email' in feed.get('author_detail', []):
            return self._validate_email(feed.author_detail.email)
        if self.use_publisher_email:
            # Optionally fall back to publisher / error-report addresses.
            if 'email' in feed.get('publisher_detail', []):
                return self._validate_email(feed.publisher_detail.email)
            if feed.get('errorreportsto', None):
                return self._validate_email(feed.errorreportsto)
        _LOG.debug('no sender address found, fallback to default')
        return self.from_email
612 def _get_entry_email(self, parsed, entry):
613 """Get the best From email address ('John <jdoe@a.com>')
615 name = self._get_entry_name(parsed=parsed, entry=entry)
616 address = self._get_entry_address(parsed=parsed, entry=entry)
617 return _formataddr((name, address))
    def _get_entry_tags(self, entry):
        """Add post tags, if available

        >>> f = Feed(name='test-feed')
        >>> f._get_entry_tags({
        ...     'tags': [{'term': 'tag1',
        ...               'label': None}]})
        >>> f._get_entry_tags({
        ...     'tags': [{'term': 'tag1',
        ...               'label': None}]})

        Test some troublesome cases.  No tags:

        >>> f._get_entry_tags({})

        >>> f._get_entry_tags({'tags': []})

        Tags without a ``term`` entry:

        >>> f._get_entry_tags({
        ...     'tags': [{'scheme': None,
        ...               'label': None}]})

        Tags with an empty term:

        >>> f._get_entry_tags({
        ...     'tags': [{'term': '',
        ...               'label': None}]})
        # Keep only tags with a non-empty 'term'.
        taglist = [tag['term'] for tag in entry.get('tags', [])
                   if tag.get('term', '')]
            return ','.join(taglist)
    def _get_entry_content(self, entry):
        """Select the best content from an entry.

        Returns a feedparser content dict.
        # How this works:
        # * We have a bunch of potential contents.
        # * We go thru looking for our first choice.
        #   (HTML or text, depending on self.html_mail)
        # * If that doesn't work, we go thru looking for our second choice.
        # * If that still doesn't work, we just take the first one.
        #
        # Possible future improvement:
        # * Instead of just taking the first one
        #   pick the one in the "best" language.
        # * HACK: hardcoded .html_mail, should take a tuple of media types
        contents = list(entry.get('content', []))
        if entry.get('summary_detail', None):
            # the summary is a usable fallback content candidate
            contents.append(entry.summary_detail)
            types = ['text/html', 'text/plain']
            types = ['text/plain', 'text/html']
        for content_type in types:
            for content in contents:
                if content['type'] == content_type:
        # Nothing matched: fall back to an empty plain-text body.
        return {'type': 'text/plain', 'value': ''}
    def _process_entry_content(self, entry, content, subject):
        "Convert entry content to the requested format."
        link = self._get_entry_link(entry)
        # HTML branch: wrap the content in header/footer markup, with
        # optional inline CSS.
        if self.use_css and self.css:
                ' <style type="text/css">',
            '<h1 class="header"><a href="{}">{}</a></h1>'.format(
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            lines.append(content['value'].strip())
            # plain text must be escaped before embedding in HTML
            lines.append(_saxutils.escape(content['value'].strip()))
        lines.append('</div>')
            '<div class="footer">'
            '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
        for enclosure in getattr(entry, 'enclosures', []):
            if getattr(enclosure, 'url', None):
                '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
            if getattr(enclosure, 'src', None):
                '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
                    '<p><img src="{}" /></p>'.format(enclosure.src))
        for elink in getattr(entry, 'links', []):
            if elink.get('rel', None) == 'via':
                title = elink.get('title', url)
                lines.append('<p>Via <a href="{}">{}</a></p>'.format(
            content['type'] = 'text/html'
            content['value'] = '\n'.join(lines)
        else: # not self.html_mail
            # Plain-text branch: convert HTML to text and append footers.
            if content['type'] in ('text/html', 'application/xhtml+xml'):
                    lines = [self._html2text(content['value'])]
                # NOTE(review): html.parser.HTMLParseError was removed in
                # Python 3.5; this except clause itself raises
                # AttributeError on modern Pythons -- confirm and update.
                except _html_parser.HTMLParseError as e:
                    raise _error.ProcessingError(parsed=None, feed=self)
                lines = [content['value']]
            lines.append('URL: {}'.format(link))
            for enclosure in getattr(entry, 'enclosures', []):
                if getattr(enclosure, 'url', None):
                    lines.append('Enclosure: {}'.format(enclosure.url))
                if getattr(enclosure, 'src', None):
                    lines.append('Enclosure: {}'.format(enclosure.src))
            for elink in getattr(entry, 'links', []):
                if elink.get('rel', None) == 'via':
                    title = elink.get('title', url)
                    lines.append('Via: {} {}'.format(title, url))
            content['type'] = 'text/plain'
            content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Deliver `message` to this feed's recipient via the email
        backend, using the feed's config section when it exists."""
        _LOG.info('send message for {}'.format(self))
        section = self.section
        if section not in self.config:
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
    def run(self, send=True):
        """Fetch and process the feed, mailing entry emails.

        ...     name='test-feed',
        ...     url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> def send(sender, message):
        ...     print('send from {}:'.format(sender))
        ...     print(message.as_string())
        >>> feed._send = send
        >>> feed.to = 'jdoe@dummy.invalid'
        >>> #parsed = feed.run() # enable for debugging
            raise _error.NoToEmailAddress(feed=self)
        parsed = self._fetch()
            digest = self._new_digest()
        for (guid, id_, sender, message) in self._process(parsed):
            _LOG.debug('new message: {}'.format(message['Subject']))
                seen.append((guid, id_))
                # digest mode: collect; otherwise send immediately
                self._append_to_digest(digest=digest, message=message)
                    self._send(sender=sender, message=message)
            if guid not in self.seen:
                self.seen[guid]['id'] = id_
        if self.digest and seen:
            if self.digest_post_process:
                # hook may rewrite (or veto) the assembled digest
                digest = self.digest_post_process(
                    feed=self, parsed=parsed, seen=seen, message=digest)
                digest=digest, seen=seen, sender=sender, send=send)
        # Remember cache validators for conditional GET on the next run.
        self.etag = parsed.get('etag', None)
        self.modified = parsed.get('modified', None)
    def _new_digest(self):
        # Build an empty multipart/digest container carrying the feed-level
        # headers; entry messages are attached by _append_to_digest.
        digest = _MIMEMultipart('digest')
        digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
        digest['Subject'] = 'digest for {}'.format(self.name)
        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
        digest['User-Agent'] = _USER_AGENT
        digest['X-RSS-Feed'] = self.url
    def _append_to_digest(self, digest, message):
        # Wrap the entry email as a message/rfc822 attachment part.
        part = _MIMEMessage(message)
        part.add_header('Content-Disposition', 'attachment')
    def _send_digest(self, digest, seen, sender, send=True):
        """Send a digest message

        The date is extracted from the last message in the digest
        payload.  We assume that this part exists.  If you don't have
        any messages in the digest, don't call this function.
        digest['From'] = sender # TODO: _Header(), _formataddr()...
        # Stamp the digest with the date of its newest entry.
        last_part = digest.get_payload()[-1]
        last_message = last_part.get_payload()[0]
        digest['Date'] = last_message['Date']

        _LOG.debug('new digest for {}'.format(self))
            self._send(sender=sender, message=digest)
        # Record the delivered entries so they aren't re-sent next run.
        for (guid, id_) in seen:
            if guid not in self.seen:
                self.seen[guid]['id'] = id_