1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
import collections as _collections
from email.mime.message import MIMEMessage as _MIMEMessage
from email.mime.multipart import MIMEMultipart as _MIMEMultipart
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import html.parser as _html_parser
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
57 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
58 _feedparser.USER_AGENT = _USER_AGENT
59 _urllib_request.install_opener(_urllib_request.build_opener())
61 for e in ['error', 'herror', 'gaierror']:
62 if hasattr(_socket, e):
63 _SOCKET_ERRORS.append(getattr(_socket, e))
64 del e # cleanup namespace
65 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
68 # TypeError: 'str' does not support the buffer interface
69 _feedparser.PREFERRED_XML_PARSERS = []
73 """Utility class for feed manipulation and storage.
77 >>> from .config import CONFIG
80 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
82 test-feed (http://example.com/feed.atom -> a@b.com)
86 'user@rss2email.invalid'
88 >>> feed.from_email = 'a@b.com'
89 >>> feed.save_to_config()
90 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
92 from = user@rss2email.invalid
97 url = http://example.com/feed.atom
102 >>> feed.etag = 'dummy etag'
103 >>> string = pickle.dumps(feed)
104 >>> feed = pickle.loads(string)
105 >>> feed.load_from_config(config=CONFIG)
109 'http://example.com/feed.atom'
111 Names can only contain ASCII letters, digits, and '._-'. Here the
112 invalid space causes an exception:
114 >>> Feed(name='invalid name')
115 Traceback (most recent call last):
117 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
119 You must define a URL:
121 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
122 Traceback (most recent call last):
124 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
129 >>> CONFIG['DEFAULT']['to'] = ''
130 >>> test_section = CONFIG.pop('feed.test-feed')
132 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
134 # saved/loaded from feed.dat using __getstate__/__setstate__.
135 _dynamic_attributes = [
142 ## saved/loaded from ConfigParser instance
143 # attributes that aren't in DEFAULT
144 _non_default_configured_attributes = [
147 # attributes that are in DEFAULT
148 _default_configured_attributes = [
149 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
150 _default_configured_attributes[
151 _default_configured_attributes.index('from')
152 ] = 'from_email' # `from` is a Python keyword
153 # all attributes that are saved/loaded from .config
154 _configured_attributes = (
155 _non_default_configured_attributes + _default_configured_attributes)
156 # attribute name -> .config option
157 _configured_attribute_translations = dict(
158 (attr,attr) for attr in _non_default_configured_attributes)
159 _configured_attribute_translations.update(dict(
160 zip(_default_configured_attributes,
161 _config.CONFIG['DEFAULT'].keys())))
162 # .config option -> attribute name
163 _configured_attribute_inverse_translations = dict(
164 (v,k) for k,v in _configured_attribute_translations.items())
166 # hints for value conversion
167 _boolean_attributes = [
170 'use_publisher_email',
178 'links_after_each_paragraph',
183 _integer_attributes = [
193 _function_attributes = [
195 'digest_post_process',
198 def __init__(self, name=None, url=None, to=None, config=None):
199 self._set_name(name=name)
201 self.__setstate__(dict(
202 (attr, getattr(self, attr))
203 for attr in self._dynamic_attributes))
204 self.load_from_config(config=config)
211 return '{} ({} -> {})'.format(self.name, self.url, self.to)
214 return '<Feed {}>'.format(str(self))
216 def __getstate__(self):
217 "Save dyamic attributes"
219 (key,getattr(self,key)) for key in self._dynamic_attributes)
221 get_state = __getstate__ # make it publicly accessible
223 def __setstate__(self, state):
224 "Restore dynamic attributes"
225 keys = sorted(state.keys())
226 if keys != sorted(self._dynamic_attributes):
227 raise ValueError(state)
228 self._set_name(name=state['name'])
229 self.__dict__.update(state)
231 set_state = __setstate__ # make it publicly accessible
233 def save_to_config(self):
234 "Save configured attributes"
235 data = _collections.OrderedDict()
236 default = self.config['DEFAULT']
237 for attr in self._configured_attributes:
238 key = self._configured_attribute_translations[attr]
239 value = getattr(self, attr)
240 if value is not None:
241 value = self._get_configured_option_value(
242 attribute=attr, value=value)
243 if (attr in self._non_default_configured_attributes or
244 value != default[key]):
246 self.config[self.section] = data
248 def load_from_config(self, config=None):
249 "Restore configured attributes"
251 config = _config.CONFIG
253 if self.section in self.config:
254 data = self.config[self.section]
256 data = self.config['DEFAULT']
257 keys = sorted(data.keys())
258 expected = sorted(self._configured_attribute_translations.values())
261 if (key not in keys and
262 key not in self._non_default_configured_attributes):
263 raise _error.InvalidFeedConfig(
264 setting=key, feed=self,
265 message='missing configuration key: {}'.format(key))
267 if key not in expected:
268 raise _error.InvalidFeedConfig(
269 setting=key, feed=self,
270 message='extra configuration key: {}'.format(key))
272 (self._configured_attribute_inverse_translations[k],
273 self._get_configured_attribute_value(
274 attribute=self._configured_attribute_inverse_translations[k],
276 for k in data.keys())
277 for attr in self._non_default_configured_attributes:
280 self.__dict__.update(data)
282 def _get_configured_option_value(self, attribute, value):
285 elif attribute in self._list_attributes:
286 return ', '.join(value)
287 elif attribute in self._function_attributes:
288 return _util.import_name(value)
291 def _get_configured_attribute_value(self, attribute, key, data):
292 if attribute in self._boolean_attributes:
293 return data.getboolean(key)
294 elif attribute in self._integer_attributes:
295 return data.getint(key)
296 elif attribute in self._list_attributes:
297 return [x.strip() for x in data[key].split(',')]
298 elif attribute in self._function_attributes:
300 return _util.import_function(data[key])
305 """Reset dynamic data
311 def _set_name(self, name):
312 if not self._name_regexp.match(name):
313 raise _error.InvalidFeedName(name=name, feed=self)
315 self.section = 'feed.{}'.format(self.name)
318 """Fetch and parse a feed using feedparser.
321 ... name='test-feed',
322 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
323 >>> parsed = feed._fetch()
327 _LOG.info('fetch {}'.format(self))
329 raise _error.InvalidFeedConfig(setting='url', feed=self)
330 if self.section in self.config:
331 config = self.config[self.section]
333 config = self.config['DEFAULT']
334 proxy = config['proxy']
335 timeout = config.getint('feed-timeout')
338 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
339 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
340 return f(self.url, self.etag, modified=self.modified, **kwargs)
342 def _process(self, parsed):
343 _LOG.info('process {}'.format(self))
344 self._check_for_errors(parsed)
345 for entry in reversed(parsed.entries):
346 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
347 processed = self._process_entry(parsed=parsed, entry=entry)
349 guid,id_,sender,message = processed
350 if self.post_process:
351 message = self.post_process(
352 feed=self, parsed=parsed, entry=entry, guid=guid,
356 yield (guid, id_, sender, message)
358 def _check_for_errors(self, parsed):
360 status = getattr(parsed, 'status', 200)
361 _LOG.debug('HTTP status {}'.format(status))
363 _LOG.info('redirect {} from {} to {}'.format(
364 self.name, self.url, parsed['url']))
365 self.url = parsed['url']
366 elif status not in [200, 302, 304]:
367 raise _error.HTTPError(status=status, feed=self)
369 http_headers = parsed.get('headers', {})
371 _LOG.debug('HTTP headers: {}'.format(http_headers))
373 _LOG.warning('could not get HTTP headers: {}'.format(self))
376 if 'html' in http_headers.get('content-type', 'rss'):
377 _LOG.warning('looks like HTML: {}'.format(self))
379 if http_headers.get('content-length', '1') == '0':
380 _LOG.warning('empty page: {}'.format(self))
383 version = parsed.get('version', None)
385 _LOG.debug('feed version {}'.format(version))
387 _LOG.warning('unrecognized version: {}'.format(self))
390 exc = parsed.get('bozo_exception', None)
391 if isinstance(exc, _socket.timeout):
392 _LOG.error('timed out: {}'.format(self))
394 elif isinstance(exc, OSError):
395 _LOG.error('{}: {}'.format(exc, self))
397 elif isinstance(exc, _SOCKET_ERRORS):
398 _LOG.error('{}: {}'.format(exc, self))
400 elif isinstance(exc, _feedparser.zlib.error):
401 _LOG.error('broken compression: {}'.format(self))
403 elif isinstance(exc, (IOError, AttributeError)):
404 _LOG.error('{}: {}'.format(exc, self))
406 elif isinstance(exc, KeyboardInterrupt):
408 elif isinstance(exc, _sax.SAXParseException):
409 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
411 elif parsed.bozo or exc:
413 exc = "can't process"
414 _LOG.error('processing error: {}: {}'.format(exc, self))
418 status in [200, 302] and
419 not parsed.entries and
421 raise _error.ProcessingError(parsed=parsed, feed=feed)
423 def _html2text(self, html, baseurl=''):
424 self.config.setup_html2text(section=self.section)
425 return _html2text.html2text(html=html, baseurl=baseurl)
427 def _process_entry(self, parsed, entry):
428 id_ = self._get_entry_id(entry)
429 # If .trust_guid isn't set, we get back hashes of the content.
430 # Instead of letting these run wild, we put them in context
431 # by associating them with the actual ID (if it exists).
432 guid = entry.get('id', id_)
433 if isinstance(guid, dict):
434 guid = guid.values()[0]
435 if guid in self.seen:
436 if self.seen[guid]['id'] == id_:
437 _LOG.debug('already seen {}'.format(id_))
438 return # already seen
439 sender = self._get_entry_email(parsed=parsed, entry=entry)
440 subject = self._get_entry_title(entry)
441 extra_headers = _collections.OrderedDict((
442 ('Date', self._get_entry_date(entry)),
443 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
444 ('User-Agent', _USER_AGENT),
445 ('X-RSS-Feed', self.url),
447 ('X-RSS-URL', self._get_entry_link(entry)),
448 ('X-RSS-TAGS', self._get_entry_tags(entry)),
450 for k,v in extra_headers.items(): # remove empty tags, etc.
453 if self.bonus_header:
454 for header in self.bonus_header.splitlines():
456 key,value = header.split(':', 1)
457 extra_headers[key.strip()] = value.strip()
460 'malformed bonus-header: {}'.format(
463 content = self._get_entry_content(entry)
465 content = self._process_entry_content(
466 entry=entry, content=content, subject=subject)
467 except _error.ProcessingError as e:
470 message = _email.get_message(
474 body=content['value'],
475 content_type=content['type'].split('/', 1)[1],
476 extra_headers=extra_headers,
478 section=self.section)
479 return (guid, id_, sender, message)
481 def _get_entry_id(self, entry):
482 """Get best ID from an entry."""
484 if getattr(entry, 'id', None):
485 # Newer versions of feedparser could return a dictionary
486 if isinstance(entry.id, dict):
487 return entry.id.values()[0]
489 content = self._get_entry_content(entry)
490 content_value = content['value'].strip()
492 return _hashlib.sha1(
493 content_value.encode('unicode-escape')).hexdigest()
494 elif getattr(entry, 'link', None):
495 return _hashlib.sha1(
496 entry.link.encode('unicode-escape')).hexdigest()
497 elif getattr(entry, 'title', None):
498 return _hashlib.sha1(
499 entry.title.encode('unicode-escape')).hexdigest()
501 def _get_entry_link(self, entry):
502 return entry.get('link', None)
504 def _get_entry_title(self, entry):
505 if hasattr(entry, 'title_detail') and entry.title_detail:
506 title = entry.title_detail.value
507 if 'html' in entry.title_detail.type:
508 title = self._html2text(title)
510 content = self._get_entry_content(entry)
511 value = content['value']
512 if content['type'] in ('text/html', 'application/xhtml+xml'):
513 value = self._html2text(value)
515 title = title.replace('\n', ' ').strip()
518 def _get_entry_date(self, entry):
519 datetime = _time.gmtime()
521 for datetype in self.date_header_order:
522 kind = datetype + '_parsed'
523 if entry.get(kind, None):
524 datetime = entry[kind]
526 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
528 def _get_entry_name(self, parsed, entry):
531 >>> import feedparser
532 >>> f = Feed(name='test-feed')
533 >>> parsed = feedparser.parse(
534 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
537 ... ' <name>Example author</name>\\n'
538 ... ' <email>me@example.com</email>\\n'
539 ... ' <url>http://example.com/</url>\\n'
544 >>> entry = parsed.entries[0]
545 >>> f.friendly_name = False
546 >>> f._get_entry_name(parsed, entry)
548 >>> f.friendly_name = True
549 >>> f._get_entry_name(parsed, entry)
552 if not self.friendly_name:
556 parts.append(feed.get('title', ''))
557 for x in [entry, feed]:
558 if 'name' in x.get('author_detail', []):
559 if x.author_detail.name:
562 parts.append(x.author_detail.name)
564 if not ''.join(parts) and self.use_publisher_email:
565 if 'name' in feed.get('publisher_detail', []):
568 parts.append(feed.publisher_detail.name)
569 return _html2text.unescape(''.join(parts))
571 def _validate_email(self, email, default=None):
572 """Do a basic quality check on email address
574 Return `default` if the address doesn't appear to be
575 well-formed. If `default` is `None`, return
578 >>> f = Feed(name='test-feed')
579 >>> f._validate_email('valid@example.com', 'default@example.com')
581 >>> f._validate_email('invalid@', 'default@example.com')
582 'default@example.com'
583 >>> f._validate_email('@invalid', 'default@example.com')
584 'default@example.com'
585 >>> f._validate_email('invalid', 'default@example.com')
586 'default@example.com'
588 parts = email.split('@')
589 if len(parts) != 2 or '' in parts:
591 return self.from_email
595 def _get_entry_address(self, parsed, entry):
596 """Get the best From email address ('<jdoe@a.com>')
598 If the best guess isn't well-formed (something@somthing.com),
599 use `self.from_email` instead.
602 return self.from_email
604 if 'email' in entry.get('author_detail', []):
605 return self._validate_email(entry.author_detail.email)
606 elif 'email' in feed.get('author_detail', []):
607 return self._validate_email(feed.author_detail.email)
608 if self.use_publisher_email:
609 if 'email' in feed.get('publisher_detail', []):
610 return self._validate_email(feed.publisher_detail.email)
611 if feed.get('errorreportsto', None):
612 return self._validate_email(feed.errorreportsto)
613 _LOG.debug('no sender address found, fallback to default')
614 return self.from_email
616 def _get_entry_email(self, parsed, entry):
617 """Get the best From email address ('John <jdoe@a.com>')
619 name = self._get_entry_name(parsed=parsed, entry=entry)
620 address = self._get_entry_address(parsed=parsed, entry=entry)
621 return _formataddr((name, address))
623 def _get_entry_tags(self, entry):
624 """Add post tags, if available
626 >>> f = Feed(name='test-feed')
627 >>> f._get_entry_tags({
628 ... 'tags': [{'term': 'tag1',
630 ... 'label': None}]})
632 >>> f._get_entry_tags({
633 ... 'tags': [{'term': 'tag1',
638 ... 'label': None}]})
641 Test some troublesome cases. No tags:
643 >>> f._get_entry_tags({})
647 >>> f._get_entry_tags({'tags': []})
649 Tags without a ``term`` entry:
651 >>> f._get_entry_tags({
652 ... 'tags': [{'scheme': None,
653 ... 'label': None}]})
655 Tags with an empty term:
657 >>> f._get_entry_tags({
658 ... 'tags': [{'term': '',
660 ... 'label': None}]})
662 taglist = [tag['term'] for tag in entry.get('tags', [])
663 if tag.get('term', '')]
665 return ','.join(taglist)
667 def _get_entry_content(self, entry):
668 """Select the best content from an entry.
670 Returns a feedparser content dict.
673 # * We have a bunch of potential contents.
674 # * We go thru looking for our first choice.
675 # (HTML or text, depending on self.html_mail)
676 # * If that doesn't work, we go thru looking for our second choice.
677 # * If that still doesn't work, we just take the first one.
679 # Possible future improvement:
680 # * Instead of just taking the first one
681 # pick the one in the "best" language.
682 # * HACK: hardcoded .html_mail, should take a tuple of media types
683 contents = list(entry.get('content', []))
684 if entry.get('summary_detail', None):
685 contents.append(entry.summary_detail)
687 types = ['text/html', 'text/plain']
689 types = ['text/plain', 'text/html']
690 for content_type in types:
691 for content in contents:
692 if content['type'] == content_type:
696 return {'type': 'text/plain', 'value': ''}
698 def _process_entry_content(self, entry, content, subject):
699 "Convert entry content to the requested format."
700 link = self._get_entry_link(entry)
707 if self.use_css and self.css:
709 ' <style type="text/css">',
717 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
721 if content['type'] in ('text/html', 'application/xhtml+xml'):
722 lines.append(content['value'].strip())
724 lines.append(_saxutils.escape(content['value'].strip()))
725 lines.append('</div>')
727 '<div class="footer">'
728 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
730 for enclosure in getattr(entry, 'enclosures', []):
731 if getattr(enclosure, 'url', None):
733 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
735 if getattr(enclosure, 'src', None):
737 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
740 '<p><img src="{}" /></p>'.format(enclosure.src))
741 for elink in getattr(entry, 'links', []):
742 if elink.get('rel', None) == 'via':
744 title = elink.get('title', url)
745 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
753 content['type'] = 'text/html'
754 content['value'] = '\n'.join(lines)
756 else: # not self.html_mail
757 if content['type'] in ('text/html', 'application/xhtml+xml'):
759 lines = [self._html2text(content['value'])]
760 except _html_parser.HTMLParseError as e:
761 raise _error.ProcessingError(parsed=None, feed=self)
763 lines = [content['value']]
765 lines.append('URL: {}'.format(link))
766 for enclosure in getattr(entry, 'enclosures', []):
767 if getattr(enclosure, 'url', None):
768 lines.append('Enclosure: {}'.format(enclosure.url))
769 if getattr(enclosure, 'src', None):
770 lines.append('Enclosure: {}'.format(enclosure.src))
771 for elink in getattr(entry, 'links', []):
772 if elink.get('rel', None) == 'via':
774 title = elink.get('title', url)
775 lines.append('Via: {} {}'.format(title, url))
776 content['type'] = 'text/plain'
777 content['value'] = '\n'.join(lines)
780 def _send(self, sender, message):
781 _LOG.info('send message for {}'.format(self))
782 section = self.section
783 if section not in self.config:
785 _email.send(sender=sender, recipient=self.to, message=message,
786 config=self.config, section=section)
788 def run(self, send=True):
789 """Fetch and process the feed, mailing entry emails.
792 ... name='test-feed',
793 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
794 >>> def send(sender, message):
795 ... print('send from {}:'.format(sender))
796 ... print(message.as_string())
797 >>> feed._send = send
798 >>> feed.to = 'jdoe@dummy.invalid'
799 >>> #parsed = feed.run() # enable for debugging
802 raise _error.NoToEmailAddress(feed=self)
803 parsed = self._fetch()
806 digest = self._new_digest()
809 for (guid, id_, sender, message) in self._process(parsed):
810 _LOG.debug('new message: {}'.format(message['Subject']))
812 seen.append((guid, id_))
813 self._append_to_digest(digest=digest, message=message)
816 self._send(sender=sender, message=message)
817 if guid not in self.seen:
819 self.seen[guid]['id'] = id_
821 if self.digest and seen:
822 if self.digest_post_process:
823 digest = self.digest_post_process(
824 feed=self, parsed=parsed, seen=seen, message=digest)
828 digest=digest, seen=seen, sender=sender, send=send)
830 self.etag = parsed.get('etag', None)
831 self.modified = parsed.get('modified', None)
833 def _new_digest(self):
834 digest = _MIMEMultipart('digest')
835 digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
836 digest['Subject'] = 'digest for {}'.format(self.name)
837 digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
838 digest['User-Agent'] = _USER_AGENT
839 digest['X-RSS-Feed'] = self.url
842 def _append_to_digest(self, digest, message):
843 part = _MIMEMessage(message)
844 part.add_header('Content-Disposition', 'attachment')
847 def _send_digest(self, digest, seen, sender, send=True):
848 """Send a digest message
850 The date is extracted from the last message in the digest
851 payload. We assume that this part exists. If you don't have
852 any messages in the digest, don't call this function.
854 digest['From'] = sender # TODO: _Header(), _formataddr()...
855 last_part = digest.get_payload()[-1]
856 last_message = last_part.get_payload()[0]
857 digest['Date'] = last_message['Date']
859 _LOG.debug('new digest for {}'.format(self))
861 self._send(sender=sender, message=digest)
862 for (guid, id_) in seen:
863 if guid not in self.seen:
865 self.seen[guid]['id'] = id_