1 # Copyright (C) 2004-2013 Aaron Swartz
5 # Etienne Millon <me@emillon.org>
7 # Lindsey Smith <lindsey.smith@gmail.com>
9 # Martin 'Joey' Schulze
11 # W. Trevor King <wking@tremily.us>
13 # This file is part of rss2email.
15 # rss2email is free software: you can redistribute it and/or modify it under
16 # the terms of the GNU General Public License as published by the Free Software
17 # Foundation, either version 2 of the License, or (at your option) version 3 of
20 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
21 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
24 # You should have received a copy of the GNU General Public License along with
25 # rss2email. If not, see <http://www.gnu.org/licenses/>.
27 """Define the ``Feed`` class for handling a single feed
30 import collections as _collections
31 from email.mime.message import MIMEMessage as _MIMEMessage
32 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
33 from email.utils import formataddr as _formataddr
34 import hashlib as _hashlib
35 import html.parser as _html_parser
37 import socket as _socket
39 import urllib.error as _urllib_error
40 import urllib.request as _urllib_request
42 import xml.sax as _sax
43 import xml.sax.saxutils as _saxutils
45 import feedparser as _feedparser
46 import html2text as _html2text
49 from . import __version__
50 from . import LOG as _LOG
51 from . import config as _config
52 from . import email as _email
53 from . import error as _error
54 from . import util as _util
# Module-level initialization: identify ourselves to remote servers and
# collect whichever socket exception classes this platform defines.
# NOTE(review): `__url__` and the `_SOCKET_ERRORS = []` initializer are
# not visible in this excerpt -- presumably defined on elided lines.
57 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
58 _feedparser.USER_AGENT = _USER_AGENT
59 _urllib_request.install_opener(_urllib_request.build_opener())
# Not every platform exposes all three names, hence the hasattr() check.
61 for e in ['error', 'herror', 'gaierror']:
62 if hasattr(_socket, e):
63 _SOCKET_ERRORS.append(getattr(_socket, e))
64 del e # cleanup namespace
# Freeze as a tuple so it can be used directly in isinstance() checks.
65 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
69 """Utility class for feed manipulation and storage.
73 >>> from .config import CONFIG
76 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
78 test-feed (http://example.com/feed.atom -> a@b.com)
82 'user@rss2email.invalid'
84 >>> feed.from_email = 'a@b.com'
85 >>> feed.save_to_config()
86 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
88 from = user@rss2email.invalid
93 url = http://example.com/feed.atom
98 >>> feed.etag = 'dummy etag'
99 >>> string = pickle.dumps(feed)
100 >>> feed = pickle.loads(string)
101 >>> feed.load_from_config(config=CONFIG)
105 'http://example.com/feed.atom'
107 Names can only contain ASCII letters, digits, and '._-'. Here the
108 invalid space causes an exception:
110 >>> Feed(name='invalid name')
111 Traceback (most recent call last):
113 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
115 You must define a URL:
117 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
118 Traceback (most recent call last):
120 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
125 >>> CONFIG['DEFAULT']['to'] = ''
126 >>> test_section = CONFIG.pop('feed.test-feed')
# Class-level metadata describing where each Feed attribute is persisted:
# dynamic state goes to the data file via __getstate__/__setstate__,
# configured state goes to the ConfigParser-backed .config file.
# NOTE(review): several list literals (and the `_re` import) are elided
# from this excerpt.
128 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
130 # saved/loaded from feed.dat using __getstate__/__setstate__.
131 _dynamic_attributes = [
138 ## saved/loaded from ConfigParser instance
139 # attributes that aren't in DEFAULT
140 _non_default_configured_attributes = [
143 # attributes that are in DEFAULT
144 _default_configured_attributes = [
145 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
146 _default_configured_attributes[
147 _default_configured_attributes.index('from')
148 ] = 'from_email' # `from` is a Python keyword
149 # all attributes that are saved/loaded from .config
150 _configured_attributes = (
151 _non_default_configured_attributes + _default_configured_attributes)
152 # attribute name -> .config option
153 _configured_attribute_translations = dict(
154 (attr,attr) for attr in _non_default_configured_attributes)
155 _configured_attribute_translations.update(dict(
156 zip(_default_configured_attributes,
157 _config.CONFIG['DEFAULT'].keys())))
158 # .config option -> attribute name
159 _configured_attribute_inverse_translations = dict(
160 (v,k) for k,v in _configured_attribute_translations.items())
162 # hints for value conversion
163 _boolean_attributes = [
166 'use_publisher_email',
174 'links_after_each_paragraph',
179 _integer_attributes = [
189 _function_attributes = [
191 'digest_post_process',
# Construct a Feed: validate the name, seed the dynamic attributes with
# their class-level defaults, then overlay values stored in the config.
# NOTE(review): elided lines presumably apply the `url`/`to` arguments
# after loading the config -- not visible in this excerpt.
194 def __init__(self, name=None, url=None, to=None, config=None):
195 self._set_name(name=name)
197 self.__setstate__(dict(
198 (attr, getattr(self, attr))
199 for attr in self._dynamic_attributes))
200 self.load_from_config(config=config)
# Human-readable / debug representations, plus pickling support.
# __str__ shows 'name (url -> to)'; __repr__ wraps it in '<Feed ...>'.
207 return '{} ({} -> {})'.format(self.name, self.url, self.to)
210 return '<Feed {}>'.format(str(self))
# Pickle only the dynamic attributes (configured ones are reloaded
# from the config file instead).
212 def __getstate__(self):
213 "Save dynamic attributes"
215 (key,getattr(self,key)) for key in self._dynamic_attributes)
217 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic attributes.

    `state` must supply exactly the keys listed in
    `_dynamic_attributes`; any mismatch raises ValueError.
    """
    expected = sorted(self._dynamic_attributes)
    if sorted(state.keys()) != expected:
        raise ValueError(state)
    self._set_name(name=state['name'])
    self.__dict__.update(state)
set_state = __setstate__  # make it publicly accessible
# Serialize the configured attributes into self.config[self.section],
# skipping values that merely repeat the DEFAULT section.
# NOTE(review): the elided line between 240 and 242 presumably stores
# the value (`data[key] = value`) -- confirm against the full source.
229 def save_to_config(self):
230 "Save configured attributes"
231 data = _collections.OrderedDict()
232 default = self.config['DEFAULT']
233 for attr in self._configured_attributes:
234 key = self._configured_attribute_translations[attr]
235 value = getattr(self, attr)
236 if value is not None:
237 value = self._get_configured_option_value(
238 attribute=attr, value=value)
239 if (attr in self._non_default_configured_attributes or
240 value != default[key]):
242 self.config[self.section] = data
# Rehydrate configured attributes from the ConfigParser instance,
# validating that the stored keys match exactly what we expect and
# converting option strings back to Python values.
244 def load_from_config(self, config=None):
245 "Restore configured attributes"
247 config = _config.CONFIG
249 if self.section in self.config:
250 data = self.config[self.section]
252 data = self.config['DEFAULT']
253 keys = sorted(data.keys())
254 expected = sorted(self._configured_attribute_translations.values())
257 if (key not in keys and
258 key not in self._non_default_configured_attributes):
259 raise _error.InvalidFeedConfig(
260 setting=key, feed=self,
261 message='missing configuration key: {}'.format(key))
263 if key not in expected:
264 raise _error.InvalidFeedConfig(
265 setting=key, feed=self,
266 message='extra configuration key: {}'.format(key))
# Translate each .config option name back to its attribute name and
# convert the raw string to the proper type.
268 (self._configured_attribute_inverse_translations[k],
269 self._get_configured_attribute_value(
270 attribute=self._configured_attribute_inverse_translations[k],
272 for k in data.keys())
273 for attr in self._non_default_configured_attributes:
276 self.__dict__.update(data)
# Convert an attribute value into the string form stored in .config
# (lists become comma-joined strings, functions become dotted names).
# NOTE(review): the branches on elided lines 279-280 and the final
# fallback (presumably `return str(value)`) are not visible here.
278 def _get_configured_option_value(self, attribute, value):
281 elif attribute in self._list_attributes:
282 return ', '.join(value)
283 elif attribute in self._function_attributes:
284 return _util.import_name(value)
# Inverse of _get_configured_option_value: convert the raw .config
# string at `key` into the Python value for `attribute`, using the
# class-level type-hint lists to pick the conversion.
287 def _get_configured_attribute_value(self, attribute, key, data):
288 if attribute in self._boolean_attributes:
289 return data.getboolean(key)
290 elif attribute in self._integer_attributes:
291 return data.getint(key)
292 elif attribute in self._list_attributes:
293 return [x.strip() for x in data[key].split(',')]
294 elif attribute in self._function_attributes:
# NOTE(review): line 295 is elided (likely a guard on empty values).
296 return _util.import_function(data[key])
301 """Reset dynamic data
# Validate and store the feed name; names are restricted to ASCII
# letters, digits, and '._-' so they are safe as config section names.
# NOTE(review): the elided line 310 presumably assigns `self.name = name`
# before `self.name` is used below -- confirm against the full source.
307 def _set_name(self, name):
308 if not self._name_regexp.match(name):
309 raise _error.InvalidFeedName(name=name, feed=self)
311 self.section = 'feed.{}'.format(self.name)
314 """Fetch and parse a feed using feedparser.
317 ... name='test-feed',
318 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
319 >>> parsed = feed._fetch()
# Body of _fetch(): issue a conditional GET (etag/modified) through
# feedparser, honoring the configured proxy and feed timeout.
323 _LOG.info('fetch {}'.format(self))
# NOTE(review): the guard for this raise (presumably `if not self.url:`)
# is on an elided line.
325 raise _error.InvalidFeedConfig(setting='url', feed=self)
326 if self.section in self.config:
327 config = self.config[self.section]
329 config = self.config['DEFAULT']
330 proxy = config['proxy']
331 timeout = config.getint('feed-timeout')
# kwargs is initialized on elided lines; the proxy handler is only
# installed when a proxy is configured.
334 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
# Wrap feedparser.parse so a hung fetch is bounded by `timeout`.
335 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
336 return f(self.url, self.etag, modified=self.modified, **kwargs)
# Generator: check the parsed feed for errors, then yield
# (guid, id, sender, message) for each new entry, oldest first,
# applying the optional per-entry post_process hook.
338 def _process(self, parsed):
339 _LOG.info('process {}'.format(self))
340 self._check_for_errors(parsed)
# reversed() so entries are emailed in chronological order.
341 for entry in reversed(parsed.entries):
342 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
343 processed = self._process_entry(parsed=parsed, entry=entry)
345 guid,id_,sender,message = processed
346 if self.post_process:
347 message = self.post_process(
348 feed=self, parsed=parsed, entry=entry, guid=guid,
352 yield (guid, id_, sender, message)
# Inspect the parsed feed for transport- and parse-level problems:
# follow permanent redirects, reject bad HTTP statuses, warn on
# suspicious headers, and classify feedparser's bozo_exception.
354 def _check_for_errors(self, parsed):
356 status = getattr(parsed, 'status', 200)
357 _LOG.debug('HTTP status {}'.format(status))
# Permanent redirect: update our stored URL to the new location.
359 _LOG.info('redirect {} from {} to {}'.format(
360 self.name, self.url, parsed['url']))
361 self.url = parsed['url']
362 elif status not in [200, 302, 304]:
363 raise _error.HTTPError(status=status, feed=self)
365 http_headers = parsed.get('headers', {})
367 _LOG.debug('HTTP headers: {}'.format(http_headers))
369 _LOG.warning('could not get HTTP headers: {}'.format(self))
372 if 'html' in http_headers.get('content-type', 'rss'):
373 _LOG.warning('looks like HTML: {}'.format(self))
375 if http_headers.get('content-length', '1') == '0':
376 _LOG.warning('empty page: {}'.format(self))
379 version = parsed.get('version', None)
381 _LOG.debug('feed version {}'.format(version))
383 _LOG.warning('unrecognized version: {}'.format(self))
# Classify whatever exception feedparser recorded while parsing.
386 exc = parsed.get('bozo_exception', None)
387 if isinstance(exc, _socket.timeout):
388 _LOG.error('timed out: {}'.format(self))
390 elif isinstance(exc, OSError):
391 _LOG.error('{}: {}'.format(exc, self))
393 elif isinstance(exc, _SOCKET_ERRORS):
394 _LOG.error('{}: {}'.format(exc, self))
396 elif isinstance(exc, _feedparser.zlib.error):
397 _LOG.error('broken compression: {}'.format(self))
399 elif isinstance(exc, (IOError, AttributeError)):
400 _LOG.error('{}: {}'.format(exc, self))
402 elif isinstance(exc, KeyboardInterrupt):
404 elif isinstance(exc, _sax.SAXParseException):
405 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
407 elif parsed.bozo or exc:
409 exc = "can't process"
410 _LOG.error('processing error: {}: {}'.format(exc, self))
414 status in [200, 302] and
415 not parsed.entries and
# NOTE(review): `feed` is undefined in this method's scope, so this
# raise would itself fail with NameError; it should presumably be
# `feed=self` -- confirm against upstream.
417 raise _error.ProcessingError(parsed=parsed, feed=feed)
# Turn one feedparser entry into (guid, id, sender, message), or
# return None when the entry was already seen. Builds the extra email
# headers (Date, Message-ID, X-RSS-*) and renders the body content.
419 def _process_entry(self, parsed, entry):
420 id_ = self._get_entry_id(entry)
421 # If .trust_guid isn't set, we get back hashes of the content.
422 # Instead of letting these run wild, we put them in context
423 # by associating them with the actual ID (if it exists).
424 guid = entry.get('id', id_)
425 if isinstance(guid, dict):
# NOTE(review): dict views are not indexable on Python 3, so
# `guid.values()[0]` raises TypeError; should presumably be
# `next(iter(guid.values()))` -- confirm against upstream.
426 guid = guid.values()[0]
427 if guid in self.seen:
428 if self.seen[guid]['id'] == id_:
429 _LOG.debug('already seen {}'.format(id_))
430 return # already seen
431 sender = self._get_entry_email(parsed=parsed, entry=entry)
432 subject = self._get_entry_title(entry)
433 extra_headers = _collections.OrderedDict((
434 ('Date', self._get_entry_date(entry)),
435 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
436 ('User-Agent', _USER_AGENT),
437 ('X-RSS-Feed', self.url),
439 ('X-RSS-URL', self._get_entry_link(entry)),
440 ('X-RSS-TAGS', self._get_entry_tags(entry)),
442 for k,v in extra_headers.items(): # remove empty tags, etc.
# Optional user-supplied headers, one 'Key: value' pair per line.
445 if self.bonus_header:
446 for header in self.bonus_header.splitlines():
448 key,value = header.split(':', 1)
449 extra_headers[key.strip()] = value.strip()
452 'malformed bonus-header: {}'.format(
455 content = self._get_entry_content(entry)
457 content = self._process_entry_content(
458 entry=entry, content=content, subject=subject)
459 except _error.ProcessingError as e:
462 message = _email.get_message(
466 body=content['value'],
467 content_type=content['type'].split('/', 1)[1],
468 extra_headers=extra_headers,
470 section=self.section)
471 return (guid, id_, sender, message)
def _get_entry_id(self, entry):
    """Get best ID from an entry.

    Prefer the feed-supplied GUID when ``trust_guid`` is set;
    otherwise fall back to a SHA1 hash of the entry's content,
    link, or title, in that order.  Returns ``None`` when nothing
    usable is available.
    """
    if self.trust_guid:
        if getattr(entry, 'id', None):
            # Newer versions of feedparser could return a dictionary
            if isinstance(entry.id, dict):
                # dict views are not indexable on Python 3; the old
                # `entry.id.values()[0]` raised TypeError here.
                return next(iter(entry.id.values()))
            return entry.id
    content = self._get_entry_content(entry)
    content_value = content['value'].strip()
    if content_value:
        return _hashlib.sha1(
            content_value.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'link', None):
        return _hashlib.sha1(
            entry.link.encode('unicode-escape')).hexdigest()
    elif getattr(entry, 'title', None):
        return _hashlib.sha1(
            entry.title.encode('unicode-escape')).hexdigest()
def _get_entry_link(self, entry):
    """Return the entry's ``link`` URL, or ``None`` when it has none."""
    return entry.get('link', None)
# Derive a subject line for the entry: prefer the feed-supplied title
# (converted from HTML when necessary), otherwise fall back to a
# truncated version of the entry content. Elided lines hold the
# `else:` branch, the truncation, and the final return.
496 def _get_entry_title(self, entry):
497 if hasattr(entry, 'title_detail') and entry.title_detail:
498 title = entry.title_detail.value
499 if 'html' in entry.title_detail.type:
500 title = _html2text.html2text(title)
502 content = self._get_entry_content(entry)
503 value = content['value']
504 if content['type'] in ('text/html', 'application/xhtml+xml'):
505 value = _html2text.html2text(value)
# Subjects must be a single line.
507 title = title.replace('\n', ' ').strip()
# Build an RFC-2822 Date header for the entry, trying the configured
# date fields in `date_header_order` and falling back to "now" (UTC).
# NOTE(review): the loop presumably breaks on the first match (elided
# line 517); `_time` is imported on an elided line.
510 def _get_entry_date(self, entry):
511 datetime = _time.gmtime()
513 for datetype in self.date_header_order:
514 kind = datetype + '_parsed'
515 if entry.get(kind, None):
516 datetime = entry[kind]
518 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
# Compose a display name for the From header, preferring the entry's
# author, then the feed's author/title, then (optionally) the
# publisher. Returns '' when friendly names are disabled.
520 def _get_entry_name(self, parsed, entry):
523 >>> import feedparser
524 >>> f = Feed(name='test-feed')
525 >>> parsed = feedparser.parse(
526 ... '<feed xmlns="http://www.w3.org/2005/Atom">\n'
529 ... ' <name>Example author</name>\n'
530 ... ' <email>me@example.com</email>\n'
531 ... ' <url>http://example.com/</url>\n'
536 >>> entry = parsed.entries[0]
537 >>> f.friendly_name = False
538 >>> f._get_entry_name(parsed, entry)
540 >>> f.friendly_name = True
541 >>> f._get_entry_name(parsed, entry)
544 if not self.friendly_name:
# `feed` and `parts` are initialized on elided lines.
548 parts.append(feed.get('title', ''))
549 for x in [entry, feed]:
550 if 'name' in x.get('author_detail', []):
551 if x.author_detail.name:
554 parts.append(x.author_detail.name)
556 if not ''.join(parts) and self.use_publisher_email:
557 if 'name' in feed.get('publisher_detail', []):
560 parts.append(feed.publisher_detail.name)
561 return _html2text.unescape(''.join(parts))
# Basic sanity check on an email address: exactly one '@' with
# non-empty local and domain parts. Per the docstring, a malformed
# address yields `default` (or self.from_email when default is None).
563 def _validate_email(self, email, default=None):
564 """Do a basic quality check on email address
566 Return `default` if the address doesn't appear to be
567 well-formed. If `default` is `None`, return
570 >>> f = Feed(name='test-feed')
571 >>> f._validate_email('valid@example.com', 'default@example.com')
573 >>> f._validate_email('invalid@', 'default@example.com')
574 'default@example.com'
575 >>> f._validate_email('@invalid', 'default@example.com')
576 'default@example.com'
577 >>> f._validate_email('invalid', 'default@example.com')
578 'default@example.com'
580 parts = email.split('@')
581 if len(parts) != 2 or '' in parts:
# NOTE(review): the `default is None` guard and the `return default` /
# `return email` branches are on elided lines.
583 return self.from_email
# Pick the best From address: entry author, feed author, then
# (optionally) publisher or errorreportsto, each run through
# _validate_email; fall back to self.from_email.
587 def _get_entry_address(self, parsed, entry):
588 """Get the best From email address ('<jdoe@a.com>')
590 If the best guess isn't well-formed (something@somthing.com),
591 use `self.from_email` instead.
# NOTE(review): the guard for this early return (likely
# `if self.force_from:`) and the `feed = parsed.feed` assignment are
# on elided lines.
594 return self.from_email
596 if 'email' in entry.get('author_detail', []):
597 return self._validate_email(entry.author_detail.email)
598 elif 'email' in feed.get('author_detail', []):
599 return self._validate_email(feed.author_detail.email)
600 if self.use_publisher_email:
601 if 'email' in feed.get('publisher_detail', []):
602 return self._validate_email(feed.publisher_detail.email)
603 if feed.get('errorreportsto', None):
604 return self._validate_email(feed.errorreportsto)
605 _LOG.debug('no sender address found, fallback to default')
606 return self.from_email
def _get_entry_email(self, parsed, entry):
    """Build the best From header value ('John <jdoe@a.com>').

    Combines the friendly name from `_get_entry_name` with the
    address from `_get_entry_address` into a single RFC-2822 value.
    """
    return _formataddr((
        self._get_entry_name(parsed=parsed, entry=entry),
        self._get_entry_address(parsed=parsed, entry=entry)))
# Collect the entry's non-empty tag terms into a comma-joined string;
# per the doctests below, entries with no usable tags produce no value
# (the elided line 656 presumably guards with `if taglist:`).
615 def _get_entry_tags(self, entry):
616 """Add post tags, if available
618 >>> f = Feed(name='test-feed')
619 >>> f._get_entry_tags({
620 ... 'tags': [{'term': 'tag1',
622 ... 'label': None}]})
624 >>> f._get_entry_tags({
625 ... 'tags': [{'term': 'tag1',
630 ... 'label': None}]})
633 Test some troublesome cases. No tags:
635 >>> f._get_entry_tags({})
639 >>> f._get_entry_tags({'tags': []})
641 Tags without a ``term`` entry:
643 >>> f._get_entry_tags({
644 ... 'tags': [{'scheme': None,
645 ... 'label': None}]})
647 Tags with an empty term:
649 >>> f._get_entry_tags({
650 ... 'tags': [{'term': '',
652 ... 'label': None}]})
654 taglist = [tag['term'] for tag in entry.get('tags', [])
655 if tag.get('term', '')]
657 return ','.join(taglist)
# Choose the best content block from the entry's candidates,
# preferring HTML or plain text depending on self.html_mail.
659 def _get_entry_content(self, entry):
660 """Select the best content from an entry.
662 Returns a feedparser content dict.
665 # * We have a bunch of potential contents.
666 # * We go thru looking for our first choice.
667 # (HTML or text, depending on self.html_mail)
668 # * If that doesn't work, we go thru looking for our second choice.
669 # * If that still doesn't work, we just take the first one.
671 # Possible future improvement:
672 # * Instead of just taking the first one
673 # pick the one in the "best" language.
674 # * HACK: hardcoded .html_mail, should take a tuple of media types
675 contents = list(entry.get('content', []))
676 if entry.get('summary_detail', None):
677 contents.append(entry.summary_detail)
# The `if self.html_mail:` / `else:` lines around these are elided.
679 types = ['text/html', 'text/plain']
681 types = ['text/plain', 'text/html']
682 for content_type in types:
683 for content in contents:
684 if content['type'] == content_type:
# NOTE(review): the `return content` and the first-content fallback
# are on elided lines; an empty plain-text dict is the last resort.
688 return {'type': 'text/plain', 'value': ''}
# Render the selected content into the outgoing email body: an HTML
# document (optional CSS, header link, enclosures, 'via' links) when
# html_mail is set, otherwise a plain-text rendering via html2text.
690 def _process_entry_content(self, entry, content, subject):
691 "Convert entry content to the requested format."
692 link = self._get_entry_link(entry)
699 if self.use_css and self.css:
701 ' <style type="text/css">',
709 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
# Already-HTML content is embedded as-is; plain text is escaped.
713 if content['type'] in ('text/html', 'application/xhtml+xml'):
714 lines.append(content['value'].strip())
716 lines.append(_saxutils.escape(content['value'].strip()))
717 lines.append('</div>')
719 '<div class="footer">'
720 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
722 for enclosure in getattr(entry, 'enclosures', []):
723 if getattr(enclosure, 'url', None):
725 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
727 if getattr(enclosure, 'src', None):
729 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
732 '<p><img src="{}" /></p>'.format(enclosure.src))
733 for elink in getattr(entry, 'links', []):
734 if elink.get('rel', None) == 'via':
736 title = elink.get('title', url)
737 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
745 content['type'] = 'text/html'
746 content['value'] = '\n'.join(lines)
748 else: # not self.html_mail
749 if content['type'] in ('text/html', 'application/xhtml+xml'):
751 lines = [_html2text.html2text(content['value'])]
# NOTE(review): html.parser.HTMLParseError was removed in Python 3.5,
# so evaluating this except clause would raise AttributeError on
# modern Pythons -- confirm against upstream's fix.
752 except _html_parser.HTMLParseError as e:
753 raise _error.ProcessingError(parsed=None, feed=self)
755 lines = [content['value']]
757 lines.append('URL: {}'.format(link))
758 for enclosure in getattr(entry, 'enclosures', []):
759 if getattr(enclosure, 'url', None):
760 lines.append('Enclosure: {}'.format(enclosure.url))
761 if getattr(enclosure, 'src', None):
762 lines.append('Enclosure: {}'.format(enclosure.src))
763 for elink in getattr(entry, 'links', []):
764 if elink.get('rel', None) == 'via':
766 title = elink.get('title', url)
767 lines.append('Via: {} {}'.format(title, url))
768 content['type'] = 'text/plain'
769 content['value'] = '\n'.join(lines)
# Send one message for this feed via the email backend, using this
# feed's config section when it exists.
# NOTE(review): the elided line 776 presumably falls back with
# `section = 'DEFAULT'`.
772 def _send(self, sender, message):
773 _LOG.info('send message for {}'.format(self))
774 section = self.section
775 if section not in self.config:
777 _email.send(sender=sender, recipient=self.to, message=message,
778 config=self.config, section=section)
# Main entry point: fetch, process, and (optionally) mail each new
# entry -- or a single digest -- then record what was seen and the
# server's etag/modified values for the next conditional GET.
780 def run(self, send=True):
781 """Fetch and process the feed, mailing entry emails.
784 ... name='test-feed',
785 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
786 >>> def send(sender, message):
787 ... print('send from {}:'.format(sender))
788 ... print(message.as_string())
789 >>> feed._send = send
790 >>> feed.to = 'jdoe@dummy.invalid'
791 >>> #parsed = feed.run() # enable for debugging
794 raise _error.NoToEmailAddress(feed=self)
795 parsed = self._fetch()
798 digest = self._new_digest()
801 for (guid, id_, sender, message) in self._process(parsed):
802 _LOG.debug('new message: {}'.format(message['Subject']))
804 seen.append((guid, id_))
805 self._append_to_digest(digest=digest, message=message)
808 self._send(sender=sender, message=message)
809 if guid not in self.seen:
811 self.seen[guid]['id'] = id_
# The digest is sent once, after the loop; `sender` here is the value
# left over from the last processed entry.
813 if self.digest and seen:
814 if self.digest_post_process:
815 digest = self.digest_post_process(
816 feed=self, parsed=parsed, seen=seen, message=digest)
820 digest=digest, seen=seen, sender=sender, send=send)
# Remember the validators for the next conditional GET.
822 self.etag = parsed.get('etag', None)
823 self.modified = parsed.get('modified', None)
# Create an empty multipart/digest message with this feed's standard
# headers; individual entry messages are attached later.
# NOTE(review): the `return digest` (and any remaining headers) are on
# elided lines.
825 def _new_digest(self):
826 digest = _MIMEMultipart('digest')
827 digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
828 digest['Subject'] = 'digest for {}'.format(self.name)
829 digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
830 digest['User-Agent'] = _USER_AGENT
831 digest['X-RSS-Feed'] = self.url
# Wrap an entry message as an attached message/rfc822 part of the
# digest. NOTE(review): the `digest.attach(part)` call is on an
# elided line.
834 def _append_to_digest(self, digest, message):
835 part = _MIMEMessage(message)
836 part.add_header('Content-Disposition', 'attachment')
# Finalize and send the digest, stamping From and taking Date from the
# last attached message, then mark every digested entry as seen.
839 def _send_digest(self, digest, seen, sender, send=True):
840 """Send a digest message
842 The date is extracted from the last message in the digest
843 payload. We assume that this part exists. If you don't have
844 any messages in the digest, don't call this function.
846 digest['From'] = sender # TODO: _Header(), _formataddr()...
847 last_part = digest.get_payload()[-1]
848 last_message = last_part.get_payload()[0]
849 digest['Date'] = last_message['Date']
851 _LOG.debug('new digest for {}'.format(self))
853 self._send(sender=sender, message=digest)
# NOTE(review): the elided line 856 presumably initializes
# `self.seen[guid] = {}` before the id is stored.
854 for (guid, id_) in seen:
855 if guid not in self.seen:
857 self.seen[guid]['id'] = id_