1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
"""Define the ``Feed`` class for handling a single feed"""
import collections as _collections
from email.mime.message import MIMEMessage as _MIMEMessage
from email.mime.multipart import MIMEMultipart as _MIMEMultipart
from email.utils import formataddr as _formataddr
import hashlib as _hashlib
import html.parser as _html_parser
import re as _re
import socket as _socket
import time as _time
import urllib.error as _urllib_error
import urllib.request as _urllib_request
import uuid as _uuid
import xml.sax as _sax
import xml.sax.saxutils as _saxutils

import feedparser as _feedparser
import html2text as _html2text

from . import __url__
from . import __version__
from . import LOG as _LOG
from . import config as _config
from . import email as _email
from . import error as _error
from . import util as _util
# Identify ourselves to servers; feedparser and urllib both use it.
_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())

# Collect whichever socket exception classes this platform's socket
# module actually defines, so we can catch them as a group later.
# BUG FIX: the list must be initialized before .append() is called on it.
_SOCKET_ERRORS = []
for e in ['error', 'herror', 'gaierror']:
    if hasattr(_socket, e):
        _SOCKET_ERRORS.append(getattr(_socket, e))
del e  # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
    """Utility class for feed manipulation and storage.

    >>> from .config import CONFIG
    ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
    test-feed (http://example.com/feed.atom -> a@b.com)
    'user@rss2email.invalid'
    >>> feed.from_email = 'a@b.com'
    >>> feed.save_to_config()
    >>> feed.config.write(sys.stdout)  # doctest: +REPORT_UDIFF, +ELLIPSIS
    from = user@rss2email.invalid
    url = http://example.com/feed.atom
    >>> feed.etag = 'dummy etag'
    >>> string = pickle.dumps(feed)
    >>> feed = pickle.loads(string)
    >>> feed.load_from_config(config=CONFIG)
    'http://example.com/feed.atom'

    Names can only contain ASCII letters, digits, and '._-'.  Here the
    invalid space causes an exception:

    >>> Feed(name='invalid name')
    Traceback (most recent call last):
      ...
    rss2email.error.InvalidFeedName: invalid feed name 'invalid name'

    You must define a URL:

    >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
    Traceback (most recent call last):
      ...
    rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}

    >>> CONFIG['DEFAULT']['to'] = ''
    >>> test_section = CONFIG.pop('feed.test-feed')
    """
    # Feed names are restricted to ASCII letters/digits plus '._-'
    # (enforced in _set_name).
    _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')

    # Attributes that hold runtime state, saved/loaded from feed.dat
    # using __getstate__/__setstate__.
    # NOTE(review): the element list is missing from this view of the
    # file -- confirm the full contents upstream.
    _dynamic_attributes = [

    ## saved/loaded from ConfigParser instance
    # attributes that aren't in DEFAULT
    # NOTE(review): element list missing from this view.
    _non_default_configured_attributes = [
    # attributes that are in DEFAULT
    _default_configured_attributes = [
        key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
    _default_configured_attributes[
        _default_configured_attributes.index('from')
        ] = 'from_email'  # `from` is a Python keyword
    # all attributes that are saved/loaded from .config
    _configured_attributes = (
        _non_default_configured_attributes + _default_configured_attributes)
    # attribute name -> .config option
    _configured_attribute_translations = dict(
        (attr,attr) for attr in _non_default_configured_attributes)
    _configured_attribute_translations.update(dict(
        zip(_default_configured_attributes,
            _config.CONFIG['DEFAULT'].keys())))
    # .config option -> attribute name
    _configured_attribute_inverse_translations = dict(
        (v,k) for k,v in _configured_attribute_translations.items())

    # hints for value conversion: names listed below are parsed with
    # getboolean()/getint()/... in _get_configured_attribute_value().
    # NOTE(review): these lists appear truncated in this view.
    _boolean_attributes = [
        'use_publisher_email',
        'links_after_each_paragraph',
    _integer_attributes = [
    _function_attributes = [
        'digest_post_process',
    def __init__(self, name=None, url=None, to=None, config=None):
        """Validate the feed name and initialize feed state.

        NOTE(review): several lines of this method -- and the
        ``def __str__``/``def __repr__`` headers below -- are missing
        from this view; the bodies shown are partial.
        """
        self._set_name(name=name)
        # seed the dynamic attributes, then overlay configured values
        self.__setstate__(dict(
            (attr, getattr(self, attr))
            for attr in self._dynamic_attributes))
        self.load_from_config(config=config)

        # NOTE(review): `def __str__(self):` header not visible in this view.
        return '{} ({} -> {})'.format(self.name, self.url, self.to)

        # NOTE(review): `def __repr__(self):` header not visible in this view.
        return '<Feed {}>'.format(str(self))
    def __getstate__(self):
        "Save dynamic attributes"
        # NOTE(review): the `return dict(` opener is missing from this
        # view; the generator expression below is its argument.
            (key,getattr(self,key)) for key in self._dynamic_attributes)

    get_state = __getstate__  # make it publicly accessible
218 def __setstate__(self, state):
219 "Restore dynamic attributes"
220 keys = sorted(state.keys())
221 if keys != sorted(self._dynamic_attributes):
222 raise ValueError(state)
223 self._set_name(name=state['name'])
224 self.__dict__.update(state)
226 set_state = __setstate__ # make it publicly accessible
    def save_to_config(self):
        "Save configured attributes"
        data = _collections.OrderedDict()
        default = self.config['DEFAULT']
        for attr in self._configured_attributes:
            key = self._configured_attribute_translations[attr]
            value = getattr(self, attr)
            if value is not None:
                value = self._get_configured_option_value(
                    attribute=attr, value=value)
            # only persist options that differ from the DEFAULT section
            if (attr in self._non_default_configured_attributes or
                value != default[key]):
                # NOTE(review): the `data[key] = value` assignment is
                # missing from this view of the file.
        self.config[self.section] = data
    def load_from_config(self, config=None):
        """Restore configured attributes.

        NOTE(review): several lines of this method are missing from
        this view (the `config is None` guard, the section-selection
        `else:`, the validation loop headers, and the `data = dict(`
        opener); the body shown is partial.
        """
            config = _config.CONFIG
        if self.section in self.config:
            data = self.config[self.section]
            data = self.config['DEFAULT']
        keys = sorted(data.keys())
        expected = sorted(self._configured_attribute_translations.values())
            # every expected option (except non-default ones) must exist
            if (key not in keys and
                key not in self._non_default_configured_attributes):
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='missing configuration key: {}'.format(key))
            # and nothing unexpected may appear
            if key not in expected:
                raise _error.InvalidFeedConfig(
                    setting=key, feed=self,
                    message='extra configuration key: {}'.format(key))
            # translate option names back to attribute names and convert
            # the stored strings to typed values
            (self._configured_attribute_inverse_translations[k],
             self._get_configured_attribute_value(
                attribute=self._configured_attribute_inverse_translations[k],
            for k in data.keys())
        for attr in self._non_default_configured_attributes:
        self.__dict__.update(data)
    def _get_configured_option_value(self, attribute, value):
        """Convert attribute *value* to its string form for ConfigParser.

        NOTE(review): the leading branch and the final fallback return
        are missing from this view of the file.
        """
        elif attribute in self._list_attributes:
            # lists are stored as comma-separated strings
            return ', '.join(value)
        elif attribute in self._function_attributes:
            return _util.import_name(value)
286 def _get_configured_attribute_value(self, attribute, key, data):
287 if attribute in self._boolean_attributes:
288 return data.getboolean(key)
289 elif attribute in self._integer_attributes:
290 return data.getint(key)
291 elif attribute in self._list_attributes:
292 return [x.strip() for x in data[key].split(',')]
293 elif attribute in self._function_attributes:
295 return _util.import_function(data[key])
    # NOTE(review): the `def _reset(self):` header and body are missing
    # from this view; only its docstring opener survives.
    """Reset dynamic data"""

    def _set_name(self, name):
        """Validate *name* and derive the config section name.

        Raises InvalidFeedName unless *name* matches _name_regexp.
        NOTE(review): the `self.name = name` assignment is missing from
        this view of the file.
        """
        if not self._name_regexp.match(name):
            raise _error.InvalidFeedName(name=name, feed=self)
        self.section = 'feed.{}'.format(self.name)
        # NOTE(review): the `def _fetch(self):` header is missing from
        # this view; the docstring and body below belong to it.
        """Fetch and parse a feed using feedparser.

        ... name='test-feed',
        ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> parsed = feed._fetch()
        """
        _LOG.info('fetch {}'.format(self))
        # a feed without a URL cannot be fetched
        raise _error.InvalidFeedConfig(setting='url', feed=self)
        if self.section in self.config:
            config = self.config[self.section]
            config = self.config['DEFAULT']
        proxy = config['proxy']
        timeout = config.getint('feed-timeout')
        # NOTE(review): the `kwargs = {}` / `if proxy:` lines are
        # missing from this view.
        kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
        # enforce the configured timeout around feedparser.parse
        f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
        return f(self.url, self.etag, modified=self.modified, **kwargs)
    def _process(self, parsed):
        """Yield (guid, id_, sender, message) for each new entry.

        NOTE(review): the `if processed:` guard and part of the
        post_process call are missing from this view.
        """
        _LOG.info('process {}'.format(self))
        self._check_for_errors(parsed)
        # oldest entry first, so messages are generated in published order
        for entry in reversed(parsed.entries):
            _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
            processed = self._process_entry(parsed=parsed, entry=entry)
            guid,id_,sender,message = processed
            # optional user hook: may rewrite or drop the message
            if self.post_process:
                message = self.post_process(
                    feed=self, parsed=parsed, entry=entry, guid=guid,
                yield (guid, id_, sender, message)
353 def _check_for_errors(self, parsed):
355 status = getattr(parsed, 'status', 200)
356 _LOG.debug('HTTP status {}'.format(status))
358 _LOG.info('redirect {} from {} to {}'.format(
359 self.name, self.url, parsed['url']))
360 self.url = parsed['url']
361 elif status not in [200, 302, 304]:
362 raise _error.HTTPError(status=status, feed=self)
364 http_headers = parsed.get('headers', {})
366 _LOG.debug('HTTP headers: {}'.format(http_headers))
368 _LOG.warning('could not get HTTP headers: {}'.format(self))
371 if 'html' in http_headers.get('content-type', 'rss'):
372 _LOG.warning('looks like HTML: {}'.format(self))
374 if http_headers.get('content-length', '1') == '0':
375 _LOG.warning('empty page: {}'.format(self))
378 version = parsed.get('version', None)
380 _LOG.debug('feed version {}'.format(version))
382 _LOG.warning('unrecognized version: {}'.format(self))
385 exc = parsed.get('bozo_exception', None)
386 if isinstance(exc, _socket.timeout):
387 _LOG.error('timed out: {}'.format(self))
389 elif isinstance(exc, OSError):
390 _LOG.error('{}: {}'.format(exc, self))
392 elif isinstance(exc, _SOCKET_ERRORS):
393 _LOG.error('{}: {}'.format(exc, self))
395 elif isinstance(exc, _feedparser.zlib.error):
396 _LOG.error('broken compression: {}'.format(self))
398 elif isinstance(exc, (IOError, AttributeError)):
399 _LOG.error('{}: {}'.format(exc, self))
401 elif isinstance(exc, KeyboardInterrupt):
403 elif isinstance(exc, _sax.SAXParseException):
404 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
406 elif parsed.bozo or exc:
408 exc = "can't process"
409 _LOG.error('processing error: {}: {}'.format(exc, self))
413 status in [200, 302] and
414 not parsed.entries and
416 raise _error.ProcessingError(parsed=parsed, feed=feed)
418 def _html2text(self, html, baseurl=''):
419 self.config.setup_html2text(section=self.section)
420 return _html2text.html2text(html=html, baseurl=baseurl)
    def _process_entry(self, parsed, entry):
        """Build (guid, id_, sender, message) for one feed entry.

        Returns None when the entry has already been seen.
        NOTE(review): several lines of this method are missing from this
        view (the `if processed` plumbing, empty-header removal body,
        the bonus-header try/except frame, and parts of the
        get_message() call); the body shown is partial.
        """
        id_ = self._get_entry_id(entry)
        # If .trust_guid isn't set, we get back hashes of the content.
        # Instead of letting these run wild, we put them in context
        # by associating them with the actual ID (if it exists).
        guid = entry.get('id', id_)
        if isinstance(guid, dict):
            # NOTE(review): dict.values() is not subscriptable on
            # Python 3 -- this needs next(iter(guid.values())).
            guid = guid.values()[0]
        if guid in self.seen:
            if self.seen[guid]['id'] == id_:
                _LOG.debug('already seen {}'.format(id_))
                return  # already seen
        sender = self._get_entry_email(parsed=parsed, entry=entry)
        subject = self._get_entry_title(entry)
        extra_headers = _collections.OrderedDict((
            ('Date', self._get_entry_date(entry)),
            ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
            ('User-Agent', _USER_AGENT),
            ('X-RSS-Feed', self.url),
            ('X-RSS-URL', self._get_entry_link(entry)),
            ('X-RSS-TAGS', self._get_entry_tags(entry)),
        for k,v in extra_headers.items():  # remove empty tags, etc.
        # user-configured extra headers, one "Key: value" per line
        if self.bonus_header:
            for header in self.bonus_header.splitlines():
                key,value = header.split(':', 1)
                extra_headers[key.strip()] = value.strip()
                'malformed bonus-header: {}'.format(
        content = self._get_entry_content(entry)
            content = self._process_entry_content(
                entry=entry, content=content, subject=subject)
        except _error.ProcessingError as e:
        message = _email.get_message(
            body=content['value'],
            content_type=content['type'].split('/', 1)[1],
            extra_headers=extra_headers,
            section=self.section)
        return (guid, id_, sender, message)
    def _get_entry_id(self, entry):
        """Get best ID from an entry.

        Prefers the entry's own id, then falls back to a SHA1 hash of
        the content, link, or title.
        NOTE(review): a few lines are missing from this view (the
        trust-guid guard, `return entry.id`, and the `if content_value:`
        guard), so the branch structure shown is partial.
        """
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # NOTE(review): dict.values() is not subscriptable on
                # Python 3 -- this needs next(iter(entry.id.values())).
                return entry.id.values()[0]
        content = self._get_entry_content(entry)
        content_value = content['value'].strip()
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'link', None):
            return _hashlib.sha1(
                entry.link.encode('unicode-escape')).hexdigest()
        elif getattr(entry, 'title', None):
            return _hashlib.sha1(
                entry.title.encode('unicode-escape')).hexdigest()
496 def _get_entry_link(self, entry):
497 return entry.get('link', None)
    def _get_entry_title(self, entry):
        """Return a single-line plain-text title for *entry*.

        Uses title_detail when present, otherwise derives a title from
        the entry content.
        NOTE(review): the else-branch header and the final
        `return title` are missing from this view.
        """
        if hasattr(entry, 'title_detail') and entry.title_detail:
            title = entry.title_detail.value
            if 'html' in entry.title_detail.type:
                title = self._html2text(title)
        content = self._get_entry_content(entry)
        value = content['value']
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            value = self._html2text(value)
        # headers must be single-line
        title = title.replace('\n', ' ').strip()
    def _get_entry_date(self, entry):
        """Return an RFC 2822 date string for *entry*.

        Walks self.date_header_order and takes a `*_parsed` date the
        entry provides, defaulting to the current UTC time.
        NOTE(review): a `break` after the first match appears to be
        missing from this view; as shown, the *last* matching date
        type wins -- confirm against the full file.
        """
        datetime = _time.gmtime()
        for datetype in self.date_header_order:
            kind = datetype + '_parsed'
            if entry.get(kind, None):
                datetime = entry[kind]
        return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
    def _get_entry_name(self, parsed, entry):
        """Get the best display name for the entry's author.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '    <name>Example author</name>\\n'
        ...     '    <email>me@example.com</email>\\n'
        ...     '    <url>http://example.com/</url>\\n'
        >>> entry = parsed.entries[0]
        >>> f.name_format = ''
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.name_format = '{author}'
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        >>> f.name_format = '{feed-title}: {author}'
        >>> f._get_entry_name(parsed, entry)
        >>> f.name_format = '{author} ({feed.name})'
        >>> f._get_entry_name(parsed, entry)
        'Example author (test-feed)'
        """
        if not self.name_format:
            # NOTE(review): the `return ''` for this branch is missing
            # from this view.
        data = {'feed': self}
        # NOTE(review): `feed = parsed.feed` -- the source of the local
        # `feed` used below -- is missing from this view.
        data['feed-title'] = feed.get('title', '')
        # prefer the entry author; fall back to the feed author
        for x in [entry, feed]:
            if 'name' in x.get('author_detail', []):
                if x.author_detail.name:
                    data['author'] = x.author_detail.name
        if 'name' in feed.get('publisher_detail', []):
            data['publisher'] = feed.publisher_detail.name
        name = self.name_format.format(**data)
        return _html2text.unescape(name)
    def _validate_email(self, email, default=None):
        """Do a basic quality check on email address

        Return `default` if the address doesn't appear to be
        well-formed.  If `default` is `None`, return
        `self.from_email` instead.

        >>> f = Feed(name='test-feed')
        >>> f._validate_email('valid@example.com', 'default@example.com')
        'valid@example.com'
        >>> f._validate_email('invalid@', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('@invalid', 'default@example.com')
        'default@example.com'
        >>> f._validate_email('invalid', 'default@example.com')
        'default@example.com'
        """
        # well-formed enough = exactly one '@' with non-empty halves
        parts = email.split('@')
        if len(parts) != 2 or '' in parts:
            # NOTE(review): the `if default is None:` guard and the
            # `return default` / `return email` branches are missing
            # from this view.
            return self.from_email
    def _get_entry_address(self, parsed, entry):
        """Get the best From email address ('<jdoe@a.com>')

        If the best guess isn't well-formed (something@somthing.com),
        use `self.from_email` instead.
        """
        # NOTE(review): the `if self.force_from:` guard for this early
        # return and the `feed = parsed.feed` assignment are missing
        # from this view.
        return self.from_email
        # prefer the entry author's address, then the feed author's
        if 'email' in entry.get('author_detail', []):
            return self._validate_email(entry.author_detail.email)
        elif 'email' in feed.get('author_detail', []):
            return self._validate_email(feed.author_detail.email)
        if self.use_publisher_email:
            if 'email' in feed.get('publisher_detail', []):
                return self._validate_email(feed.publisher_detail.email)
            if feed.get('errorreportsto', None):
                return self._validate_email(feed.errorreportsto)
        _LOG.debug('no sender address found, fallback to default')
        return self.from_email
613 def _get_entry_email(self, parsed, entry):
614 """Get the best From email address ('John <jdoe@a.com>')
616 name = self._get_entry_name(parsed=parsed, entry=entry)
617 address = self._get_entry_address(parsed=parsed, entry=entry)
618 return _formataddr((name, address))
    def _get_entry_tags(self, entry):
        """Add post tags, if available

        >>> f = Feed(name='test-feed')
        >>> f._get_entry_tags({
        ...         'tags': [{'term': 'tag1',
        ...                   'label': None}]})
        >>> f._get_entry_tags({
        ...         'tags': [{'term': 'tag1',
        ...                   'label': None}]})

        Test some troublesome cases.  No tags:

        >>> f._get_entry_tags({})

        Empty tags:

        >>> f._get_entry_tags({'tags': []})

        Tags without a ``term`` entry:

        >>> f._get_entry_tags({
        ...         'tags': [{'scheme': None,
        ...                   'label': None}]})

        Tags with an empty term:

        >>> f._get_entry_tags({
        ...         'tags': [{'term': '',
        ...                   'label': None}]})
        """
        # keep only tags that actually carry a non-empty 'term'
        taglist = [tag['term'] for tag in entry.get('tags', [])
                   if tag.get('term', '')]
        # NOTE(review): an `if taglist:` guard line appears to be
        # missing from this view (the doctests above show no output --
        # i.e. None -- for the empty cases); confirm upstream.
        return ','.join(taglist)
    def _get_entry_content(self, entry):
        """Select the best content from an entry.

        Returns a feedparser content dict.
        """
        # How this works:
        # * We have a bunch of potential contents.
        # * We go thru looking for our first choice.
        #   (HTML or text, depending on self.html_mail)
        # * If that doesn't work, we go thru looking for our second choice.
        # * If that still doesn't work, we just take the first one.
        #
        # Possible future improvement:
        # * Instead of just taking the first one
        #   pick the one in the "best" language.
        # * HACK: hardcoded .html_mail, should take a tuple of media types
        contents = list(entry.get('content', []))
        if entry.get('summary_detail', None):
            contents.append(entry.summary_detail)
        # NOTE(review): the `if self.html_mail:` / `else:` lines that
        # choose between these two orderings are missing from this view.
        types = ['text/html', 'text/plain']
            types = ['text/plain', 'text/html']
        for content_type in types:
            for content in contents:
                if content['type'] == content_type:
                    # NOTE(review): the `return content` line is
                    # missing from this view.
        # nothing matched: empty plain-text fallback
        return {'type': 'text/plain', 'value': ''}
    def _process_entry_content(self, entry, content, subject):
        "Convert entry content to the requested format."
        # NOTE(review): many lines of this method are missing from this
        # view (the `if self.html_mail:` branch header, the HTML
        # skeleton/CSS lines, several `lines.append(` openers, guard
        # lines, and the final `return content`); the body shown is
        # partial.
        link = self._get_entry_link(entry)
        if self.use_css and self.css:
                '  <style type="text/css">',
            '<h1 class="header"><a href="{}">{}</a></h1>'.format(
        if content['type'] in ('text/html', 'application/xhtml+xml'):
            lines.append(content['value'].strip())
            # plain text embedded in the HTML body must be escaped
            lines.append(_saxutils.escape(content['value'].strip()))
        lines.append('</div>')
            '<div class="footer">'
            '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
        for enclosure in getattr(entry, 'enclosures', []):
            if getattr(enclosure, 'url', None):
                    '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
            if getattr(enclosure, 'src', None):
                    '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
                    '<p><img src="{}" /></p>'.format(enclosure.src))
        for elink in getattr(entry, 'links', []):
            if elink.get('rel', None) == 'via':
                title = elink.get('title', url)
                lines.append('<p>Via <a href="{}">{}</a></p>'.format(
        content['type'] = 'text/html'
        content['value'] = '\n'.join(lines)
        else:  # not self.html_mail
            if content['type'] in ('text/html', 'application/xhtml+xml'):
                    lines = [self._html2text(content['value'])]
                # NOTE(review): html.parser.HTMLParseError was removed
                # in Python 3.5, so evaluating this except clause raises
                # AttributeError on modern Pythons; a broader exception
                # type is needed here -- confirm against upstream.
                except _html_parser.HTMLParseError as e:
                    raise _error.ProcessingError(parsed=None, feed=self)
                lines = [content['value']]
            lines.append('URL: {}'.format(link))
            for enclosure in getattr(entry, 'enclosures', []):
                if getattr(enclosure, 'url', None):
                    lines.append('Enclosure: {}'.format(enclosure.url))
                if getattr(enclosure, 'src', None):
                    lines.append('Enclosure: {}'.format(enclosure.src))
            for elink in getattr(entry, 'links', []):
                if elink.get('rel', None) == 'via':
                    title = elink.get('title', url)
                    lines.append('Via: {} {}'.format(title, url))
            content['type'] = 'text/plain'
            content['value'] = '\n'.join(lines)
    def _send(self, sender, message):
        """Deliver *message* to self.to using the rss2email email helpers."""
        _LOG.info('send message for {}'.format(self))
        section = self.section
        if section not in self.config:
            # NOTE(review): the `section = 'DEFAULT'` fallback line is
            # missing from this view.
        _email.send(sender=sender, recipient=self.to, message=message,
                    config=self.config, section=section)
    def run(self, send=True):
        """Fetch and process the feed, mailing entry emails.

        >>> feed = Feed(
        ...    name='test-feed',
        ...    url='http://feeds.feedburner.com/allthingsrss/hJBr')
        >>> def send(sender, message):
        ...    print('send from {}:'.format(sender))
        ...    print(message.as_string())
        >>> feed._send = send
        >>> feed.to = 'jdoe@dummy.invalid'
        >>> #parsed = feed.run()  # enable for debugging
        """
        # NOTE(review): several lines of this method are missing from
        # this view (the `if not self.to:` guard, `if self.digest:`
        # headers, `seen = []`, `elif send:`, `self.seen[guid] = {}`,
        # and the digest_post_process early-return); the body shown is
        # partial.
        raise _error.NoToEmailAddress(feed=self)
        parsed = self._fetch()
            digest = self._new_digest()
        for (guid, id_, sender, message) in self._process(parsed):
            _LOG.debug('new message: {}'.format(message['Subject']))
                seen.append((guid, id_))
                self._append_to_digest(digest=digest, message=message)
                self._send(sender=sender, message=message)
            if guid not in self.seen:
                self.seen[guid]['id'] = id_
        if self.digest and seen:
            # optional user hook: may rewrite or drop the digest
            if self.digest_post_process:
                digest = self.digest_post_process(
                    feed=self, parsed=parsed, seen=seen, message=digest)
                digest=digest, seen=seen, sender=sender, send=send)
        # remember cache validators for the next fetch
        self.etag = parsed.get('etag', None)
        self.modified = parsed.get('modified', None)
    def _new_digest(self):
        """Create an empty multipart/digest container for this feed.

        NOTE(review): the trailing `return digest` is missing from this
        view of the file.
        """
        digest = _MIMEMultipart('digest')
        digest['To'] = self.to  # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
        digest['Subject'] = 'digest for {}'.format(self.name)
        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
        digest['User-Agent'] = _USER_AGENT
        digest['X-RSS-Feed'] = self.url
    def _append_to_digest(self, digest, message):
        """Attach *message* to *digest* as a message/rfc822 part.

        NOTE(review): the `digest.attach(part)` call is missing from
        this view of the file.
        """
        part = _MIMEMessage(message)
        part.add_header('Content-Disposition', 'attachment')
    def _send_digest(self, digest, seen, sender, send=True):
        """Send a digest message

        The date is extracted from the last message in the digest
        payload.  We assume that this part exists.  If you don't have
        any messages in the digest, don't call this function.
        """
        digest['From'] = sender  # TODO: _Header(), _formataddr()...
        # date the digest after its newest attached message
        last_part = digest.get_payload()[-1]
        last_message = last_part.get_payload()[0]
        digest['Date'] = last_message['Date']

        _LOG.debug('new digest for {}'.format(self))
        # NOTE(review): the `if send:` guard for this call is missing
        # from this view.
        self._send(sender=sender, message=digest)
        for (guid, id_) in seen:
            if guid not in self.seen:
                # NOTE(review): the `self.seen[guid] = {}` line is
                # missing from this view.
                self.seen[guid]['id'] = id_