1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.mime.message import MIMEMessage as _MIMEMessage
32 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
33 from email.utils import formataddr as _formataddr
34 import hashlib as _hashlib
35 import html.parser as _html_parser
37 import socket as _socket
39 import urllib.error as _urllib_error
40 import urllib.request as _urllib_request
42 import xml.sax as _sax
43 import xml.sax.saxutils as _saxutils
45 import feedparser as _feedparser
46 import html2text as _html2text
49 from . import __version__
50 from . import LOG as _LOG
51 from . import config as _config
52 from . import email as _email
53 from . import error as _error
54 from . import util as _util
# User-Agent advertised on feed fetches; shared between feedparser and
# urllib so servers see one consistent client string.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect the socket exception classes available on this platform into a
# tuple usable with isinstance().  The list must be initialized before
# the loop appends to it (the visible code appended to an undefined name).
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
69 """Utility class for feed manipulation and storage.
73 >>> from .config import CONFIG
76 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
78 test-feed (http://example.com/feed.atom -> a@b.com)
82 'user@rss2email.invalid'
84 >>> feed.from_email = 'a@b.com'
85 >>> feed.save_to_config()
86 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
88 from = user@rss2email.invalid
93 url = http://example.com/feed.atom
98 >>> feed.etag = 'dummy etag'
99 >>> string = pickle.dumps(feed)
100 >>> feed = pickle.loads(string)
101 >>> feed.load_from_config(config=CONFIG)
105 'http://example.com/feed.atom'
107 Names can only contain ASCII letters, digits, and '._-'. Here the
108 invalid space causes an exception:
110 >>> Feed(name='invalid name')
111 Traceback (most recent call last):
113 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
115 You must define a URL:
117 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
118 Traceback (most recent call last):
120 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
125 >>> CONFIG['DEFAULT']['to'] = ''
126 >>> test_section = CONFIG.pop('feed.test-feed')
# Valid feed names: ASCII letters, digits, '.', '_', '-' (enforced by
# _set_name; see the class docstring example above).
128 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
130 # saved/loaded from feed.dat using __getstate__/__setstate__.
131 _dynamic_attributes = [
138 ## saved/loaded from ConfigParser instance
139 # attributes that aren't in DEFAULT
140 _non_default_configured_attributes = [
143 # attributes that are in DEFAULT
# Config option names use '-', Python attributes use '_'; translate here.
144 _default_configured_attributes = [
145 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
146 _default_configured_attributes[
147 _default_configured_attributes.index('from')
148 ] = 'from_email' # `from` is a Python keyword
149 # all attributes that are saved/loaded from .config
150 _configured_attributes = (
151 _non_default_configured_attributes + _default_configured_attributes)
152 # attribute name -> .config option
153 _configured_attribute_translations = dict(
154 (attr,attr) for attr in _non_default_configured_attributes)
155 _configured_attribute_translations.update(dict(
156 zip(_default_configured_attributes,
157 _config.CONFIG['DEFAULT'].keys())))
158 # .config option -> attribute name
159 _configured_attribute_inverse_translations = dict(
160 (v,k) for k,v in _configured_attribute_translations.items())
162 # hints for value conversion
# These attribute lists tell the config (de)serializers which converter
# (getboolean / getint / list split / function import) to apply.
163 _boolean_attributes = [
166 'use_publisher_email',
174 'links_after_each_paragraph',
179 _integer_attributes = [
189 _function_attributes = [
191 'digest_post_process',
# Initialize the feed: validate the name, seed dynamic state, then
# overlay any values from the ConfigParser instance.
194 def __init__(self, name=None, url=None, to=None, config=None):
195 self._set_name(name=name)
# Round-trip the freshly-reset dynamic attributes through __setstate__
# so construction and unpickling share one code path.
197 self.__setstate__(dict(
198 (attr, getattr(self, attr))
199 for attr in self._dynamic_attributes))
200 self.load_from_config(config=config)
207 return '{} ({} -> {})'.format(self.name, self.url, self.to)
210 return '<Feed {}>'.format(str(self))
212 def __getstate__(self):
213 "Save dynamic attributes"
215 (key,getattr(self,key)) for key in self._dynamic_attributes)
217 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic attributes from a pickled state mapping."""
    # Reject any state whose key set does not exactly match the
    # declared dynamic attributes.
    if sorted(state.keys()) != sorted(self._dynamic_attributes):
        raise ValueError(state)
    self._set_name(name=state['name'])
    self.__dict__.update(state)

set_state = __setstate__  # make it publicly accessible
# Serialize configured attributes into this feed's config section.
229 def save_to_config(self):
230 "Save configured attributes"
231 data = _collections.OrderedDict()
232 default = self.config['DEFAULT']
233 for attr in self._configured_attributes:
234 key = self._configured_attribute_translations[attr]
235 value = getattr(self, attr)
236 if value is not None:
237 value = self._get_configured_option_value(
238 attribute=attr, value=value)
# Only persist values that differ from DEFAULT; non-default
# attributes are always written.
239 if (attr in self._non_default_configured_attributes or
240 value != default[key]):
# NOTE(review): the store into data[key] is not visible in this view —
# confirm the branch body against the full source.
242 self.config[self.section] = data
# Populate attributes from the feed's config section (or DEFAULT),
# validating that the section has exactly the expected keys.
244 def load_from_config(self, config=None):
245 "Restore configured attributes"
247 config = _config.CONFIG
249 if self.section in self.config:
250 data = self.config[self.section]
252 data = self.config['DEFAULT']
253 keys = sorted(data.keys())
254 expected = sorted(self._configured_attribute_translations.values())
# Missing keys are only an error for DEFAULT-backed options; extra
# keys are always an error.
257 if (key not in keys and
258 key not in self._non_default_configured_attributes):
259 raise _error.InvalidFeedConfig(
260 setting=key, feed=self,
261 message='missing configuration key: {}'.format(key))
263 if key not in expected:
264 raise _error.InvalidFeedConfig(
265 setting=key, feed=self,
266 message='extra configuration key: {}'.format(key))
# Convert config option names/values back to attribute names/values.
268 (self._configured_attribute_inverse_translations[k],
269 self._get_configured_attribute_value(
270 attribute=self._configured_attribute_inverse_translations[k],
272 for k in data.keys())
273 for attr in self._non_default_configured_attributes:
276 self.__dict__.update(data)
# Convert an attribute value into its config-file string form.
278 def _get_configured_option_value(self, attribute, value):
281 elif attribute in self._list_attributes:
282 return ', '.join(value)
# NOTE(review): serializing a function attribute via import_name looks
# asymmetric with import_function below — confirm against full source.
283 elif attribute in self._function_attributes:
284 return _util.import_name(value)
# Convert a config-file string back into a typed attribute value,
# using the per-category converters declared on the class.
287 def _get_configured_attribute_value(self, attribute, key, data):
288 if attribute in self._boolean_attributes:
289 return data.getboolean(key)
290 elif attribute in self._integer_attributes:
291 return data.getint(key)
292 elif attribute in self._list_attributes:
293 return [x.strip() for x in data[key].split(',')]
294 elif attribute in self._function_attributes:
296 return _util.import_function(data[key])
301 """Reset dynamic data
# Validate and store the feed name, and derive the config section name.
307 def _set_name(self, name):
308 if not self._name_regexp.match(name):
309 raise _error.InvalidFeedName(name=name, feed=self)
311 self.section = 'feed.{}'.format(self.name)
314 """Fetch and parse a feed using feedparser.
317 ... name='test-feed',
318 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
319 >>> parsed = feed._fetch()
323 _LOG.info('fetch {}'.format(self))
325 raise _error.InvalidFeedConfig(setting='url', feed=self)
326 if self.section in self.config:
327 config = self.config[self.section]
329 config = self.config['DEFAULT']
330 proxy = config['proxy']
331 timeout = config.getint('feed-timeout')
# Route the fetch through a proxy when one is configured.
334 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
# Wrap feedparser.parse so a hung server cannot block forever; pass the
# cached etag/modified values for conditional (304) fetches.
335 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
336 return f(self.url, self.etag, modified=self.modified, **kwargs)
# Generator: validate the parsed feed, then yield one
# (guid, id, sender, message) tuple per new entry, oldest first.
338 def _process(self, parsed):
339 _LOG.info('process {}'.format(self))
340 self._check_for_errors(parsed)
# reversed() so entries are emitted in chronological order.
341 for entry in reversed(parsed.entries):
342 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
343 processed = self._process_entry(parsed=parsed, entry=entry)
345 guid,id_,sender,message = processed
# Optional user hook may rewrite (or drop) the message.
346 if self.post_process:
347 message = self.post_process(
348 feed=self, parsed=parsed, entry=entry, guid=guid,
352 yield (guid, id_, sender, message)
# Inspect a feedparser result for HTTP-level and parse-level problems,
# logging warnings and raising rss2email errors for fatal cases.
354 def _check_for_errors(self, parsed):
356 status = getattr(parsed, 'status', 200)
357 _LOG.debug('HTTP status {}'.format(status))
# Permanent redirects update the stored feed URL in place.
359 _LOG.info('redirect {} from {} to {}'.format(
360 self.name, self.url, parsed['url']))
361 self.url = parsed['url']
362 elif status not in [200, 302, 304]:
363 raise _error.HTTPError(status=status, feed=self)
365 http_headers = parsed.get('headers', {})
367 _LOG.debug('HTTP headers: {}'.format(http_headers))
369 _LOG.warning('could not get HTTP headers: {}'.format(self))
372 if 'html' in http_headers.get('content-type', 'rss'):
373 _LOG.warning('looks like HTML: {}'.format(self))
375 if http_headers.get('content-length', '1') == '0':
376 _LOG.warning('empty page: {}'.format(self))
379 version = parsed.get('version', None)
381 _LOG.debug('feed version {}'.format(version))
383 _LOG.warning('unrecognized version: {}'.format(self))
# Classify feedparser's bozo_exception to give targeted log messages.
386 exc = parsed.get('bozo_exception', None)
387 if isinstance(exc, _socket.timeout):
388 _LOG.error('timed out: {}'.format(self))
# NOTE(review): on Python 3, socket.error and friends subclass OSError,
# so this branch shadows the _SOCKET_ERRORS branch below — confirm the
# intended ordering.
390 elif isinstance(exc, OSError):
391 _LOG.error('{}: {}'.format(exc, self))
393 elif isinstance(exc, _SOCKET_ERRORS):
394 _LOG.error('{}: {}'.format(exc, self))
396 elif isinstance(exc, _feedparser.zlib.error):
397 _LOG.error('broken compression: {}'.format(self))
399 elif isinstance(exc, (IOError, AttributeError)):
400 _LOG.error('{}: {}'.format(exc, self))
402 elif isinstance(exc, KeyboardInterrupt):
404 elif isinstance(exc, _sax.SAXParseException):
405 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
407 elif parsed.bozo or exc:
409 exc = "can't process"
410 _LOG.error('processing error: {}: {}'.format(exc, self))
414 status in [200, 302] and
415 not parsed.entries and
# NOTE(review): 'feed' is undefined in this scope — this almost
# certainly should be feed=self; raising here would NameError.
417 raise _error.ProcessingError(parsed=parsed, feed=feed)
# Convert HTML to markdown-ish text via html2text, configured from this
# feed's config section; fall back to `default` on parse failure.
419 def _html2text(self, html, baseurl='', default=None):
420 self.config.setup_html2text(section=self.section)
422 return _html2text.html2text(html=html, baseurl=baseurl)
# NOTE(review): html.parser.HTMLParseError was removed in Python 3.5 —
# this except clause would itself raise AttributeError there; verify
# the targeted Python version.
423 except _html_parser.HTMLParseError as e:
424 if default is not None:
# Build the (guid, id, sender, email.Message) tuple for one entry, or
# return None when the entry has already been seen.
428 def _process_entry(self, parsed, entry):
429 id_ = self._get_entry_id(entry)
430 # If .trust_guid isn't set, we get back hashes of the content.
431 # Instead of letting these run wild, we put them in context
432 # by associating them with the actual ID (if it exists).
433 guid = entry.get('id', id_)
434 if isinstance(guid, dict):
# NOTE(review): dict.values() is not indexable on Python 3 — this line
# would raise TypeError; likely needs next(iter(guid.values())).
435 guid = guid.values()[0]
436 if guid in self.seen:
437 if self.seen[guid]['id'] == id_:
438 _LOG.debug('already seen {}'.format(id_))
439 return # already seen
440 sender = self._get_entry_email(parsed=parsed, entry=entry)
441 subject = self._get_entry_title(entry)
# Assemble the standard per-entry headers.
442 extra_headers = _collections.OrderedDict((
443 ('Date', self._get_entry_date(entry)),
444 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
445 ('User-Agent', _USER_AGENT),
446 ('X-RSS-Feed', self.url),
448 ('X-RSS-URL', self._get_entry_link(entry)),
449 ('X-RSS-TAGS', self._get_entry_tags(entry)),
451 for k,v in extra_headers.items(): # remove empty tags, etc.
# User-supplied bonus headers, one 'Key: value' per line.
454 if self.bonus_header:
455 for header in self.bonus_header.splitlines():
457 key,value = header.split(':', 1)
458 extra_headers[key.strip()] = value.strip()
461 'malformed bonus-header: {}'.format(
464 content = self._get_entry_content(entry)
466 content = self._process_entry_content(
467 entry=entry, content=content, subject=subject)
468 except _error.ProcessingError as e:
471 message = _email.get_message(
475 body=content['value'],
# content['type'] is e.g. 'text/html'; get_message wants the subtype.
476 content_type=content['type'].split('/', 1)[1],
477 extra_headers=extra_headers,
479 section=self.section)
480 return (guid, id_, sender, message)
def _get_entry_id(self, entry):
    """Get the best ID from an entry.

    Prefer the feed-supplied ID when ``self.trust_guid`` is set;
    otherwise fall back to a SHA1 hash of the entry's content, link,
    or title (in that order).  Returns None when nothing usable is
    present.
    """
    if self.trust_guid:
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # dict views are not indexable on Python 3 (the old
                # .values()[0] raised TypeError); take the first value
                # explicitly.
                return next(iter(entry.id.values()))
            return entry.id
    content = self._get_entry_content(entry)
    content_value = content['value'].strip()
    if content_value:
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'link', None):
        return _hashlib.sha1(
            entry.link.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'title', None):
        return _hashlib.sha1(
            entry.title.encode('unicode-escape')).hexdigest()
502 def _get_entry_link(self, entry):
503 return entry.get('link', None)
# Derive a plain-text subject line: prefer the entry title (converted
# from HTML when needed), falling back to the entry content.
505 def _get_entry_title(self, entry):
506 if hasattr(entry, 'title_detail') and entry.title_detail:
507 title = entry.title_detail.value
508 if 'html' in entry.title_detail.type:
509 title = self._html2text(title, default=title)
511 content = self._get_entry_content(entry)
512 value = content['value']
513 if content['type'] in ('text/html', 'application/xhtml+xml'):
514 value = self._html2text(value, default=value)
# Subjects must be a single line.
516 title = title.replace('\n', ' ').strip()
# Format the entry's date as an RFC 2822 Date header (UTC), trying the
# configured date fields in order and defaulting to "now".
519 def _get_entry_date(self, entry):
520 datetime = _time.gmtime()
522 for datetype in self.date_header_order:
523 kind = datetype + '_parsed'
524 if entry.get(kind, None):
525 datetime = entry[kind]
527 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
# Build the human-readable sender name shown in the From header, from
# the feed title plus the entry/feed author (when friendly_name is set).
529 def _get_entry_name(self, parsed, entry):
532 >>> import feedparser
533 >>> f = Feed(name='test-feed')
534 >>> parsed = feedparser.parse(
535 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
538 ... ' <name>Example author</name>\\n'
539 ... ' <email>me@example.com</email>\\n'
540 ... ' <url>http://example.com/</url>\\n'
545 >>> entry = parsed.entries[0]
546 >>> f.friendly_name = False
547 >>> f._get_entry_name(parsed, entry)
549 >>> f.friendly_name = True
550 >>> f._get_entry_name(parsed, entry)
553 if not self.friendly_name:
557 parts.append(feed.get('title', ''))
# Entry-level author wins over feed-level author.
558 for x in [entry, feed]:
559 if 'name' in x.get('author_detail', []):
560 if x.author_detail.name:
563 parts.append(x.author_detail.name)
# Last resort: publisher name, only when use_publisher_email is set.
565 if not ''.join(parts) and self.use_publisher_email:
566 if 'name' in feed.get('publisher_detail', []):
569 parts.append(feed.publisher_detail.name)
570 return _html2text.unescape(''.join(parts))
# Sanity-check an email address: exactly one '@' with non-empty local
# part and domain.
572 def _validate_email(self, email, default=None):
573 """Do a basic quality check on email address
575 Return `default` if the address doesn't appear to be
576 well-formed. If `default` is `None`, return
579 >>> f = Feed(name='test-feed')
580 >>> f._validate_email('valid@example.com', 'default@example.com')
582 >>> f._validate_email('invalid@', 'default@example.com')
583 'default@example.com'
584 >>> f._validate_email('@invalid', 'default@example.com')
585 'default@example.com'
586 >>> f._validate_email('invalid', 'default@example.com')
587 'default@example.com'
589 parts = email.split('@')
590 if len(parts) != 2 or '' in parts:
# Fall back to the feed's configured From address.
592 return self.from_email
# Pick the best bare From address, trying entry author, feed author,
# then (optionally) publisher / errorreportsto, each validated.
596 def _get_entry_address(self, parsed, entry):
597 """Get the best From email address ('<jdoe@a.com>')
599 If the best guess isn't well-formed (something@something.com),
600 use `self.from_email` instead.
603 return self.from_email
605 if 'email' in entry.get('author_detail', []):
606 return self._validate_email(entry.author_detail.email)
607 elif 'email' in feed.get('author_detail', []):
608 return self._validate_email(feed.author_detail.email)
609 if self.use_publisher_email:
610 if 'email' in feed.get('publisher_detail', []):
611 return self._validate_email(feed.publisher_detail.email)
612 if feed.get('errorreportsto', None):
613 return self._validate_email(feed.errorreportsto)
614 _LOG.debug('no sender address found, fallback to default')
615 return self.from_email
# Combine the friendly name and address into an RFC 2822 From value.
617 def _get_entry_email(self, parsed, entry):
618 """Get the best From email address ('John <jdoe@a.com>')
620 name = self._get_entry_name(parsed=parsed, entry=entry)
621 address = self._get_entry_address(parsed=parsed, entry=entry)
622 return _formataddr((name, address))
# Produce the comma-joined tag list for the X-RSS-TAGS header, skipping
# tags with a missing or empty 'term'.
624 def _get_entry_tags(self, entry):
625 """Add post tags, if available
627 >>> f = Feed(name='test-feed')
628 >>> f._get_entry_tags({
629 ... 'tags': [{'term': 'tag1',
631 ... 'label': None}]})
633 >>> f._get_entry_tags({
634 ... 'tags': [{'term': 'tag1',
639 ... 'label': None}]})
642 Test some troublesome cases. No tags:
644 >>> f._get_entry_tags({})
648 >>> f._get_entry_tags({'tags': []})
650 Tags without a ``term`` entry:
652 >>> f._get_entry_tags({
653 ... 'tags': [{'scheme': None,
654 ... 'label': None}]})
656 Tags with an empty term:
658 >>> f._get_entry_tags({
659 ... 'tags': [{'term': '',
661 ... 'label': None}]})
663 taglist = [tag['term'] for tag in entry.get('tags', [])
664 if tag.get('term', '')]
666 return ','.join(taglist)
# Choose the most suitable content block from an entry, preferring the
# media type implied by self.html_mail.
668 def _get_entry_content(self, entry):
669 """Select the best content from an entry.
671 Returns a feedparser content dict.
674 # * We have a bunch of potential contents.
675 # * We go thru looking for our first choice.
676 # (HTML or text, depending on self.html_mail)
677 # * If that doesn't work, we go thru looking for our second choice.
678 # * If that still doesn't work, we just take the first one.
680 # Possible future improvement:
681 # * Instead of just taking the first one
682 # pick the one in the "best" language.
683 # * HACK: hardcoded .html_mail, should take a tuple of media types
684 contents = list(entry.get('content', []))
# The summary counts as a candidate alongside full content blocks.
685 if entry.get('summary_detail', None):
686 contents.append(entry.summary_detail)
688 types = ['text/html', 'text/plain']
690 types = ['text/plain', 'text/html']
691 for content_type in types:
692 for content in contents:
693 if content['type'] == content_type:
# Nothing matched at all: return an empty plain-text stand-in.
697 return {'type': 'text/plain', 'value': ''}
# Render the selected content as either an HTML document (with header,
# footer, enclosures, and optional CSS) or plain text, per html_mail.
699 def _process_entry_content(self, entry, content, subject):
700 "Convert entry content to the requested format."
701 link = self._get_entry_link(entry)
708 if self.use_css and self.css:
710 ' <style type="text/css">',
718 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
# Already-HTML content is embedded as-is; plain text is escaped.
722 if content['type'] in ('text/html', 'application/xhtml+xml'):
723 lines.append(content['value'].strip())
725 lines.append(_saxutils.escape(content['value'].strip()))
726 lines.append('</div>')
728 '<div class="footer">'
729 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
# Footer: enclosure links/images and 'via' links.
731 for enclosure in getattr(entry, 'enclosures', []):
732 if getattr(enclosure, 'url', None):
734 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
736 if getattr(enclosure, 'src', None):
738 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
741 '<p><img src="{}" /></p>'.format(enclosure.src))
742 for elink in getattr(entry, 'links', []):
743 if elink.get('rel', None) == 'via':
745 title = elink.get('title', url)
746 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
754 content['type'] = 'text/html'
755 content['value'] = '\n'.join(lines)
757 else: # not self.html_mail
# Plain-text mail: convert HTML content down to text.
758 if content['type'] in ('text/html', 'application/xhtml+xml'):
760 lines = [self._html2text(content['value'])]
# NOTE(review): HTMLParseError was removed in Python 3.5 — see the
# same concern in _html2text above.
761 except _html_parser.HTMLParseError as e:
762 raise _error.ProcessingError(parsed=None, feed=self)
764 lines = [content['value']]
766 lines.append('URL: {}'.format(link))
767 for enclosure in getattr(entry, 'enclosures', []):
768 if getattr(enclosure, 'url', None):
769 lines.append('Enclosure: {}'.format(enclosure.url))
770 if getattr(enclosure, 'src', None):
771 lines.append('Enclosure: {}'.format(enclosure.src))
772 for elink in getattr(entry, 'links', []):
773 if elink.get('rel', None) == 'via':
775 title = elink.get('title', url)
776 lines.append('Via: {} {}'.format(title, url))
777 content['type'] = 'text/plain'
778 content['value'] = '\n'.join(lines)
# Hand one finished message to the email backend, using this feed's
# config section when it exists.
781 def _send(self, sender, message):
782 _LOG.info('send message for {}'.format(self))
783 section = self.section
784 if section not in self.config:
786 _email.send(sender=sender, recipient=self.to, message=message,
787 config=self.config, section=section)
# Top-level entry point: fetch, process, and (optionally) send each new
# entry (or a single digest), then record what was seen plus the HTTP
# cache validators.
789 def run(self, send=True):
790 """Fetch and process the feed, mailing entry emails.
793 ... name='test-feed',
794 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
795 >>> def send(sender, message):
796 ... print('send from {}:'.format(sender))
797 ... print(message.as_string())
798 >>> feed._send = send
799 >>> feed.to = 'jdoe@dummy.invalid'
800 >>> #parsed = feed.run() # enable for debugging
803 raise _error.NoToEmailAddress(feed=self)
804 parsed = self._fetch()
807 digest = self._new_digest()
810 for (guid, id_, sender, message) in self._process(parsed):
811 _LOG.debug('new message: {}'.format(message['Subject']))
# In digest mode messages are accumulated; otherwise sent one by one.
813 seen.append((guid, id_))
814 self._append_to_digest(digest=digest, message=message)
817 self._send(sender=sender, message=message)
818 if guid not in self.seen:
820 self.seen[guid]['id'] = id_
822 if self.digest and seen:
# Optional user hook may rewrite (or drop) the digest.
823 if self.digest_post_process:
824 digest = self.digest_post_process(
825 feed=self, parsed=parsed, seen=seen, message=digest)
829 digest=digest, seen=seen, sender=sender, send=send)
# Remember etag/modified so the next fetch can be conditional.
831 self.etag = parsed.get('etag', None)
832 self.modified = parsed.get('modified', None)
# Create an empty multipart/digest container with the feed's headers.
834 def _new_digest(self):
835 digest = _MIMEMultipart('digest')
836 digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
837 digest['Subject'] = 'digest for {}'.format(self.name)
838 digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
839 digest['User-Agent'] = _USER_AGENT
840 digest['X-RSS-Feed'] = self.url
# Wrap one entry message as an attached message/rfc822 digest part.
843 def _append_to_digest(self, digest, message):
844 part = _MIMEMessage(message)
845 part.add_header('Content-Disposition', 'attachment')
848 def _send_digest(self, digest, seen, sender, send=True):
849 """Send a digest message
851 The date is extracted from the last message in the digest
852 payload. We assume that this part exists. If you don't have
853 any messages in the digest, don't call this function.
855 digest['From'] = sender # TODO: _Header(), _formataddr()...
# Use the newest (last) entry's Date for the digest itself.
856 last_part = digest.get_payload()[-1]
857 last_message = last_part.get_payload()[0]
858 digest['Date'] = last_message['Date']
860 _LOG.debug('new digest for {}'.format(self))
862 self._send(sender=sender, message=digest)
# Record every digested entry as seen.
863 for (guid, id_) in seen:
864 if guid not in self.seen:
866 self.seen[guid]['id'] = id_