1 # -*- encoding: utf-8 -*-
3 """rss2email: get RSS feeds emailed to you
7 __url__ = 'http://rss2email.infogami.com'
8 __author__ = 'Lindsey Smith (lindsey@allthingsrss.com)'
9 __copyright__ = '(C) 2004 Aaron Swartz. GNU GPL 2 or 3.'
15 "Martin 'Joey' Schulze",
16 'Marcel Ackermann (http://www.DreamFlasher.de)',
17 'Lindsey Smith (maintainer)',
20 'Aaron Swartz (original author)',
23 import argparse as _argparse
24 import collections as _collections
25 import configparser as _configparser
26 from email.mime.text import MIMEText as _MIMEText
27 from email.header import Header as _Header
28 from email.utils import parseaddr as _parseaddr
29 from email.utils import formataddr as _formataddr
30 import hashlib as _hashlib
31 import logging as _logging
33 import pickle as _pickle
34 import pprint as _pprint
36 import smtplib as _smtplib
37 import socket as _socket
38 import subprocess as _subprocess
40 import threading as _threading
42 import traceback as _traceback
43 import types as _types
44 import urllib.request as _urllib_request
45 import urllib.error as _urllib_error
46 import xml.dom.minidom as _minidom
47 import xml.sax as _sax
48 import xml.sax.saxutils as _saxutils
52 import fcntl as _fcntl
53 # A pox on SunOS file locking methods
54 if 'sunos' not in sys.platform:
59 import feedparser as _feedparser
60 import html2text as _html2text
# Module-wide logger.  Records are written through a `StreamHandler`
# created with its default stream, and only ERROR and above are
# emitted at this level setting.
LOG = _logging.getLogger('rss2email')
LOG.addHandler(_logging.StreamHandler())
LOG.setLevel(_logging.ERROR)

# Keep a reference to the module docstring under a private name.
_MODULE_DOCSTRING = __doc__
# Put the rss2email version and project URL into the User-Agent string
# that feedparser uses.
_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
# Install a freshly built default opener as urllib's global opener.
_urllib_request.install_opener(_urllib_request.build_opener())
71 for e in ['error', 'gaierror']:
72 if hasattr(_socket, e):
73 _SOCKET_ERRORS.append(getattr(_socket, e))
74 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
77 class RSS2EmailError (Exception):
78 def __init__(self, message):
79 super(RSS2EmailError, self).__init__(message)
83 if self.__cause__ is not None:
84 LOG.error('cause: {}'.format(self.__cause__))
87 class TimeoutError (RSS2EmailError):
88 def __init__(self, time_limited_function, message=None):
90 if time_limited_function.error is not None:
92 'error while running time limited function: {}'.format(
93 time_limited_function.error[1]))
95 message = '{} second timeout exceeded'.format(
96 time_limited_function.timeout)
97 super(TimeoutError, self).__init__(message=message)
98 self.time_limited_function = time_limited_function
101 class NoValidEncodingError (ValueError, RSS2EmailError):
102 def __init__(self, string, encodings):
103 message = 'no valid encoding for {} in {}'.format(string, encodings)
104 super(NoValidEncodingError, self).__init__(message=message)
106 self.encodings = encodings
109 class SMTPConnectionError (ValueError, RSS2EmailError):
110 def __init__(self, server, message=None):
112 message = 'could not connect to mail server {}'.format(server)
113 super(SMTPConnectionError, self).__init__(message=message)
117 super(SMTPConnectionError, self).log()
119 'check your config file to confirm that smtp-server and other '
120 'mail server settings are configured properly')
121 if hasattr(self.__cause__, 'reason'):
122 LOG.error('reason: {}'.format(self.__cause__.reason))
125 class SMTPAuthenticationError (SMTPConnectionError):
126 def __init__(self, server, username):
128 'could not authenticate with mail server {} as user {}'.format(
130 super(SMTPConnectionError, self).__init__(
131 server=server, message=message)
133 self.username = username
136 class SendmailError (RSS2EmailError):
137 def __init__(self, status=None, stdout=None, stderr=None):
139 message = 'sendmail exited with code {}'.format(status)
142 super(SendmailError, self).__init__(message=message)
148 super(SendmailError, self).log()
150 'Error attempting to send email via sendmail. You may need '
151 'to configure rss2email to use an SMTP server. Please refer '
152 'to the rss2email documentation or website ({}) for complete '
153 'documentation.').format(__url__))
156 class FeedError (RSS2EmailError):
157 def __init__(self, feed, message=None):
159 message = 'error with feed {}'.format(feed.name)
160 super(FeedError, self).__init__(message=message)
class InvalidFeedName (FeedError):
    """Error for a feed name that failed validation.

    `name` is the rejected name; it is only used to build the message.
    All other keyword arguments (for example `feed`) are handed to
    `FeedError.__init__` unchanged.
    """

    def __init__(self, name, **kwargs):
        super().__init__(
            message="invalid feed name '{}'".format(name), **kwargs)
170 class ProcessingError (FeedError):
171 def __init__(self, parsed, feed, **kwargs):
173 message = 'error processing feed {}'.format(feed)
174 super(FeedError, self).__init__(feed=feed, message=message)
178 super(ProcessingError, self).log()
179 if type(self) == ProcessingError: # not a more specific subclass
181 '=== rss2email encountered a problem with this feed ===')
183 '=== See the rss2email FAQ at {} for assistance ==='.format(
186 '=== If this occurs repeatedly, send this to {} ==='.format(
189 'error: {} {}'.format(
190 self.parsed.get('bozo_exception', "can't process"),
192 LOG.warning(_pprint.pformat(self.parsed))
193 LOG.warning('rss2email', __version__)
194 LOG.warning('feedparser', _feedparser.__version__)
195 LOG.warning('html2text', _html2text.__version__)
196 LOG.warning('Python', _sys.version)
197 LOG.warning('=== END HERE ===')
class HTTPError (ProcessingError):
    """Error for a feed fetch that returned a bad HTTP status.

    Parameters:
      status -- the HTTP status code; also stored as `self.status`.
      feed   -- the feed being fetched; passed on to `FeedError`.

    Extra keyword arguments are accepted for signature compatibility
    and ignored.
    """
    def __init__(self, status, feed, **kwargs):
        message = 'HTTP status {} fetching feed {}'.format(status, feed)
        # Start the lookup just after ProcessingError so that
        # FeedError.__init__ (which accepts `feed` and `message`) runs.
        # ProcessingError.__init__ needs a `parsed` argument that is not
        # available here, and starting after FeedError reached
        # RSS2EmailError.__init__, which rejects the `feed` keyword
        # with a TypeError.
        super(ProcessingError, self).__init__(feed=feed, message=message)
        self.status = status
207 class FeedsError (RSS2EmailError):
208 def __init__(self, feeds=None, message=None, **kwargs):
210 message = 'error with feeds'
211 super(FeedsError, self).__init__(message=message, **kwargs)
215 class DataFileError (FeedsError):
216 def __init__(self, feeds, message=None):
218 message = 'problem with the feed data file {}'.format(
220 super(DataFileError, self).__init__(feeds=feeds, message=message)
223 class NoDataFile (DataFileError):
224 def __init__(self, feeds):
225 message = 'feed data file {} does not exist'.format(feeds.datafile)
226 super(NoDataFile, self).__init__(feeds=feeds, message=message)
229 super(NoDataFile, self).log()
231 "if you're using r2e for the first time, you have to run "
235 class NoToEmailAddress (FeedsError, FeedError):
236 def __init__(self, **kwargs):
237 message = 'no target email address has been defined'
238 super(NoToEmailAddress, self).__init__(message=message, **kwargs)
241 super(NoToEmailAddress, self).log()
243 "please run 'r2e email emailaddress' or "
244 "'r2e add name url emailaddress'.")
class OPMLReadError (RSS2EmailError):
    """Error raised with the fixed message 'error reading OPML'.

    Keyword arguments are passed on to `RSS2EmailError.__init__`.
    """
    def __init__(self, **kwargs):
        message = 'error reading OPML'
        # Start the super() lookup from this class so that
        # RSS2EmailError.__init__ runs.  Starting from RSS2EmailError
        # skipped it and called Exception.__init__ with a `message`
        # keyword, which raises TypeError.
        super(OPMLReadError, self).__init__(message=message, **kwargs)
class Config (_configparser.ConfigParser):
    """`ConfigParser` that stores its data in `OrderedDict` instances.

    Keyword arguments are forwarded to `ConfigParser.__init__`;
    `dict_type` defaults to `collections.OrderedDict` when the caller
    does not supply one.
    """
    def __init__(self, **kwargs):
        # Previously the keyword arguments were accepted but dropped,
        # so options such as `allow_no_value` had no effect.
        kwargs.setdefault('dict_type', _collections.OrderedDict)
        super(Config, self).__init__(**kwargs)

    def _setup(self, section='DEFAULT'):
        """Copy the html2text options of `section` into html2text.

        Sets `UNICODE_SNOB`, `LINKS_EACH_PARAGRAPH` and `BODY_WIDTH`
        on the `html2text` module from the `unicode-snob`,
        `links-after-each-paragraph` and `body-width` options.  A
        missing option falls back to False / False / 0 respectively.
        """
        _html2text.UNICODE_SNOB = self.getboolean(
            section, 'unicode-snob', fallback=False)
        # The option name must match the key used in the defaults
        # ('links-after-each-paragraph'); the earlier misspelling meant
        # the configured value was never read.
        _html2text.LINKS_EACH_PARAGRAPH = self.getboolean(
            section, 'links-after-each-paragraph', fallback=False)
        _html2text.BODY_WIDTH = self.getint(section, 'body-width', fallback=0)
267 # setup defaults for feeds that don't customize
268 CONFIG['DEFAULT'] = _collections.OrderedDict((
270 # The email address messages are from by default
271 ('from', 'bozo@dev.null.invalid'),
272 # True: Only use the 'from' address.
273 # False: Use the email address specified by the feed, when possible.
274 ('force-from', str(False)),
275 # True: Use the publisher's email if you can't find the author's.
276 # False: Just use the 'from' email instead.
277 ('use-publisher-email', str(False)),
278 # Only use the feed email address rather than friendly name
280 ('friendly-name', str(True)),
281 # Set this to default To email addresses.
285 # Set an HTTP proxy (e.g. 'http://your.proxy.here:8080/')
287 # Set the timeout (in seconds) for feed server response
288 ('feed-timeout', str(60)),
291 # True: Fetch, process, and email feeds.
292 # False: Don't fetch, process, or email feeds
293 ('active', str(True)),
294 # True: Generate Date header based on item's date, when possible.
295 # False: Generate Date header based on time sent.
296 ('date-header', str(False)),
297 # A comma-delimited list of some combination of
298 # ('issued', 'created', 'modified', 'expired')
299 # expressing ordered list of preference in dates
300 # to use for the Date header of the email.
301 ('date-header-order', 'modified, issued, created, expired'),
302 # Set this to add bonus headers to all emails
303 # Example: bonus-header = 'Approved: joe@bob.org'
304 ('bonus-header', ''),
305 # True: Receive one email per post.
306 # False: Receive an email every time a post changes.
307 ('trust-guid', str(True)),
308 # To most correctly encode emails with international
309 # characters, we iterate through the list below and use the
310 # first character set that works.  Eventually (and
311 # theoretically) UTF-8 is our catch-all failsafe.
312 ('encodings', 'US-ASCII, BIG5, ISO-2022-JP, ISO-8859-1, UTF-8'),
314 # True: Send text/html messages when possible.
315 # False: Convert HTML to plain text.
316 ('html-mail', str(False)),
317 # Optional CSS styling
318 ('use-css', str(False)),
321 ' font: 18pt Georgia, "Times New Roman";\n'
324 ' font: 12pt Arial;\n'
327 ' font: 12pt Arial;\n'
328 ' font-weight: bold;\n'
332 ' font-family: monospace;\n'
335 ' background: #e0ecff;\n'
336 ' border-bottom: solid 4px #c3d9ff;\n'
338 ' margin-top: 0px;\n'
342 ' font-size: 20px;\n'
343 ' text-decoration: none;\n'
346 ' background: #c3d9ff;\n'
347 ' border-top: solid 4px #c3d9ff;\n'
349 ' margin-bottom: 0px;\n'
352 ' border: solid 4px #c3d9ff;\n'
355 ' margin-left: 5px;\n'
356 ' margin-right: 5px;\n'
359 # Use Unicode characters instead of their ascii pseudo-replacements
360 ('unicode-snob', str(False)),
361 # Put the links after each paragraph instead of at the end.
362 ('links-after-each-paragraph', str(False)),
363 # Wrap long lines at position. 0 for no wrapping.
364 ('body-width', str(0)),
367 # True: Use SMTP_SERVER to send mail.
368 # False: Call /usr/sbin/sendmail to send mail.
369 ('use-smtp', str(False)),
370 ('smtp-server', 'smtp.yourisp.net:25'), ('smtp-auth', str(False)), # set to True to use SMTP AUTH
371 ('smtp-username', 'username'), # username for SMTP AUTH
372 ('smtp-password', 'password'), # password for SMTP AUTH
373 ('smtp-ssl', str(False)), # Connect to the SMTP server using SSL
376 # Verbosity (one of 'error', 'warning', 'info', or 'debug').
377 ('verbose', 'warning'),
381 def guess_encoding(string, encodings=('US-ASCII', 'UTF-8')):
382 """Find an encoding capable of encoding `string`.
384 >>> guess_encoding('alpha', encodings=('US-ASCII', 'UTF-8'))
386 >>> guess_encoding('α', encodings=('US-ASCII', 'UTF-8'))
388 >>> guess_encoding('α', encodings=('US-ASCII', 'ISO-8859-1'))
389 Traceback (most recent call last):
391 rss2email.NoValidEncodingError: no valid encoding for α in ('US-ASCII', 'ISO-8859-1')
393 for encoding in encodings:
395 string.encode(encoding)
396 except (UnicodeError, LookupError):
400 raise NoValidEncodingError(string=string, encodings=encodings)
402 def get_message(sender, recipient, subject, body, content_type,
403 extra_headers=None, config=None, section='DEFAULT'):
404 """Generate a `Message` instance.
406 All arguments should be Unicode strings (plain ASCII works as well).
408 Only the real name part of sender and recipient addresses may contain
409 non-ASCII characters.
411 The email will be properly MIME encoded.
413 The charset of the email will be the first one out of the list
414 that can represent all the characters occurring in the email.
416 >>> message = get_message(
417 ... sender='John <jdoe@a.com>', recipient='Ζεύς <z@olympus.org>',
418 ... subject='Testing',
419 ... body='Hello, world!\\n',
420 ... content_type='plain',
421 ... extra_headers={'Approved': 'joe@bob.org'})
422 >>> print(message.as_string()) # doctest: +REPORT_UDIFF
424 Content-Type: text/plain; charset="us-ascii"
425 Content-Transfer-Encoding: 7bit
426 From: John <jdoe@a.com>
427 To: =?utf-8?b?zpbOtc+Nz4I=?= <z@olympus.org>
429 Approved: joe@bob.org
437 x.strip() for x in config.get(section, 'encodings').split(',')]
439 # Split real name (which is optional) and email address parts
440 sender_name,sender_addr = _parseaddr(sender)
441 recipient_name,recipient_addr = _parseaddr(recipient)
443 sender_encoding = guess_encoding(sender_name, encodings)
444 recipient_encoding = guess_encoding(recipient_name, encodings)
445 subject_encoding = guess_encoding(subject, encodings)
446 body_encoding = guess_encoding(body, encodings)
448 # We must always pass Unicode strings to Header, otherwise it will
449 # use RFC 2047 encoding even on plain ASCII strings.
450 sender_name = str(_Header(sender_name, sender_encoding).encode())
451 recipient_name = str(_Header(recipient_name, recipient_encoding).encode())
453 # Make sure email addresses do not contain non-ASCII characters
454 sender_addr.encode('ascii')
455 recipient_addr.encode('ascii')
457 # Create the message ('plain' stands for Content-Type: text/plain)
458 message = _MIMEText(body, content_type, body_encoding)
459 message['From'] = _formataddr((sender_name, sender_addr))
460 message['To'] = _formataddr((recipient_name, recipient_addr))
461 message['Subject'] = _Header(subject, subject_encoding)
462 for key,value in extra_headers.items():
463 encoding = guess_encoding(value, encodings)
464 message[key] = _Header(value, encoding)
467 def smtp_send(sender, recipient, message, config=None, section='DEFAULT'):
470 server = CONFIG.get(section, 'smtp-server')
471 LOG.debug('sending message to {} via {}'.format(recipient, server))
472 ssl = CONFIG.getboolean(section, 'smtp-ssl')
474 smtp = _smtplib.SMTP_SSL()
476 smtp = _smtplib.SMTP()
479 smtp.connect(SMTP_SERVER)
480 except KeyboardInterrupt:
482 except Exception as e:
483 raise SMTPConnectionError(server=server) from e
484 if CONFIG.getboolean(section, 'smtp-auth'):
485 username = CONFIG.get(section, 'smtp-username')
486 password = CONFIG.get(section, 'smtp-password')
490 smtp.login(username, password)
491 except KeyboardInterrupt:
493 except Exception as e:
494 raise SMTPAuthenticationError(server=server, username=username)
495 smtp.send_message(message, sender, [recipient])
498 def sendmail_send(sender, recipient, message, config=None, section='DEFAULT'):
502 'sending message to {} via /usr/sbin/sendmail'.format(recipient))
504 p = _subprocess.Popen(
505 ['/usr/sbin/sendmail', recipient],
506 stdin=_subprocess.PIPE, stdout=_subprocess.PIPE,
507 stderr=_subprocess.PIPE)
508 stdout,stderr = p.communicate(message.as_string().encode('ascii'))
511 raise SendmailError(status=status, stdout=stdout, stderr=stderr)
512 except Exception as e:
513 raise SendmailError() from e
def send(sender, recipient, message, config=None, section='DEFAULT'):
    """Send `message` using the transport selected in the configuration.

    The boolean `use-smtp` option of `section` decides between
    `smtp_send` (true) and `sendmail_send` (false).  When `config` is
    `None` the module-level `CONFIG` is consulted, instead of failing
    with an `AttributeError` on the default argument.
    """
    if config is None:
        config = CONFIG
    # NOTE(review): `config` and `section` are not forwarded to the
    # transport functions, matching the existing calls -- confirm
    # whether they should be.
    if config.getboolean(section, 'use-smtp'):
        smtp_send(sender, recipient, message)
    else:
        sendmail_send(sender, recipient, message)
522 class TimeLimitedFunction (_threading.Thread):
523 """Run `function` with a time limit of `timeout` seconds.
526 >>> def sleeping_return(sleep, x):
527 ... time.sleep(sleep)
529 >>> TimeLimitedFunction(0.5, sleeping_return)(0.1, 'x')
531 >>> TimeLimitedFunction(0.5, sleeping_return)(10, 'y')
532 Traceback (most recent call last):
534 rss2email.TimeoutError: 0.5 second timeout exceeded
535 >>> TimeLimitedFunction(0.5, time.sleep)('x')
536 Traceback (most recent call last):
538 rss2email.TimeoutError: error while running time limited function: a float is required
540 def __init__(self, timeout, target, **kwargs):
541 super(TimeLimitedFunction, self).__init__(target=target, **kwargs)
542 self.setDaemon(True) # daemon kwarg only added in Python 3.3.
543 self.timeout = timeout
548 """Based on Thread.run().
550 We add handling for self.result and self.error.
554 self.result = self._target(*self._args, **self._kwargs)
556 self.error = _sys.exc_info()
558 # Avoid a refcycle if the thread is running a function with
559 # an argument that has a member that points to the thread.
560 del self._target, self._args, self._kwargs
562 def __call__(self, *args, **kwargs):
564 self._kwargs = kwargs
566 self.join(self.timeout)
568 raise TimeoutError(time_limited_function=self) from self.error[1]
570 raise TimeoutError(time_limited_function=self)
575 """Utility class for feed manipulation and storage.
581 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
583 test-feed (http://example.com/feed.atom -> a@b.com)
587 'bozo@dev.null.invalid'
589 >>> feed.from_email = 'a@b.com'
590 >>> feed.save_to_config()
591 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
593 from = bozo@dev.null.invalid
598 url = http://example.com/feed.atom
603 >>> feed.etag = 'dummy etag'
604 >>> string = pickle.dumps(feed)
605 >>> feed = pickle.loads(string)
606 >>> feed.load_from_config(config=CONFIG)
610 'http://example.com/feed.atom'
612 Names can only contain ASCII letters, digits, and '._-'. Here the
613 invalid space causes an exception:
615 >>> Feed(name='invalid name')
616 Traceback (most recent call last):
618 rss2email.InvalidFeedName: invalid feed name 'invalid name'
622 >>> CONFIG['DEFAULT']['to'] = ''
623 >>> test_section = CONFIG.pop('feed.test-feed')
625 _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
627 # saved/loaded from feed.dat using __getstate__/__setstate__.
628 _dynamic_attributes = [
635 ## saved/loaded from ConfigParser instance
636 # attributes that aren't in DEFAULT
637 _non_default_configured_attributes = [
640 # attributes that are in DEFAULT
641 _default_configured_attributes = [
642 key.replace('-', '_') for key in CONFIG['DEFAULT'].keys()]
643 _default_configured_attributes[
644 _default_configured_attributes.index('from')
645 ] = 'from_email' # `from` is a Python keyword
646 # all attributes that are saved/loaded from .config
647 _configured_attributes = (
648 _non_default_configured_attributes + _default_configured_attributes)
649 # attribute name -> .config option
650 _configured_attribute_translations = dict(
651 (attr,attr) for attr in _non_default_configured_attributes)
652 _configured_attribute_translations.update(dict(
653 zip(_default_configured_attributes, CONFIG['DEFAULT'].keys())))
654 # .config option -> attribute name
655 _configured_attribute_inverse_translations = dict(
656 (v,k) for k,v in _configured_attribute_translations.items())
658 # hints for value conversion
659 _boolean_attributes = [
661 'use_publisher_email',
669 'links_after_each_paragraph',
674 _integer_attributes = [
684 def __init__(self, name=None, url=None, to=None, config=None):
685 self._set_name(name=name)
687 self.__setstate__(dict(
688 (attr, getattr(self, attr))
689 for attr in self._dynamic_attributes))
690 self.load_from_config(config=config)
697 return '{} ({} -> {})'.format(self.name, self.url, self.to)
700 return '<Feed {}>'.format(str(self))
702 def __getstate__(self):
703 "Save dynamic attributes"
705 (key,getattr(self,key)) for key in self._dynamic_attributes)
def __setstate__(self, state):
    """Restore dynamic attributes

    `state` must contain exactly the keys named in
    `_dynamic_attributes`; otherwise `ValueError(state)` is raised.
    The name is applied through `_set_name` before the instance
    dictionary is updated with `state`.
    """
    if sorted(state.keys()) != sorted(self._dynamic_attributes):
        raise ValueError(state)
    self._set_name(name=state['name'])
    vars(self).update(state)
715 def save_to_config(self):
716 "Save configured attributes"
717 data = _collections.OrderedDict()
718 default = self.config['DEFAULT']
719 for attr in self._configured_attributes:
720 key = self._configured_attribute_translations[attr]
721 value = getattr(self, attr)
722 if value is not None:
723 value = self._get_configured_option_value(
724 attribute=attr, value=value)
725 if (attr in self._non_default_configured_attributes or
726 value != default[key]):
728 self.config[self.section] = data
730 def load_from_config(self, config=None):
731 "Restore configured attributes"
735 if self.section in self.config:
736 data = self.config[self.section]
738 data = self.config['DEFAULT']
739 keys = sorted(data.keys())
740 expected = sorted(self._configured_attribute_translations.values())
743 if (key not in keys and
744 key not in self._non_default_configured_attributes):
745 raise ValueError('missing key: {}'.format(key))
747 if key not in expected:
748 raise ValueError('extra key: {}'.format(key))
750 (self._configured_attribute_inverse_translations[k],
751 self._get_configured_attribute_value(
752 attribute=self._configured_attribute_inverse_translations[k],
754 for k in data.keys())
755 for attr in self._non_default_configured_attributes:
758 self.__dict__.update(data)
760 def _get_configured_option_value(self, attribute, value):
761 if value and attribute in self._list_attributes:
762 return ', '.join(value)
765 def _get_configured_attribute_value(self, attribute, key, data):
766 if attribute in self._boolean_attributes:
767 return data.getboolean(key)
768 elif attribute in self._integer_attributes:
769 return data.getint(key)
770 elif attribute in self._list_attributes:
771 return [x.strip() for x in data[key].split(',')]
775 """Reset dynamic data
781 def _set_name(self, name):
782 if not self._name_regexp.match(name):
783 raise InvalidFeedName(name=name, feed=self)
785 self.section = 'feed.{}'.format(self.name)
788 """Fetch and parse a feed using feedparser.
791 ... name='test-feed',
792 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
793 >>> parsed = feed._fetch()
797 LOG.info('fetch {}'.format(self))
798 if self.section in self.config:
799 config = self.config[self.section]
801 config = self.config['DEFAULT']
802 proxy = config['proxy']
803 timeout = config.getint('feed-timeout')
806 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
807 f = TimeLimitedFunction(timeout, _feedparser.parse)
808 return f(self.url, self.etag, modified=self.modified, **kwargs)
810 def _process(self, parsed):
811 LOG.info('process {}'.format(self))
812 self._check_for_errors(parsed)
813 for entry in reversed(parsed.entries):
814 LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
815 processed = self._process_entry(parsed=parsed, entry=entry)
819 def _check_for_errors(self, parsed):
821 status = getattr(parsed, 'status', 200)
822 LOG.debug('HTTP status {}'.format(status))
824 LOG.info('redirect {} from {} to {}'.format(
825 self.name, self.url, parsed['url']))
826 self.url = parsed['url']
827 elif status not in [200, 302, 304]:
828 raise HTTPError(status=status, feed=self)
830 http_headers = parsed.get('headers', {})
832 LOG.debug('HTTP headers: {}'.format(http_headers))
834 LOG.warning('could not get HTTP headers: {}'.format(self))
837 if 'html' in http_headers.get('content-type', 'rss'):
838 LOG.warning('looks like HTML: {}'.format(self))
840 if http_headers.get('content-length', '1') == '0':
841 LOG.warning('empty page: {}'.format(self))
844 version = parsed.get('version', None)
846 LOG.debug('feed version {}'.format(version))
848 LOG.warning('unrecognized version: {}'.format(self))
851 exc = parsed.get('bozo_exception', None)
852 if isinstance(exc, _socket.timeout):
853 LOG.error('timed out: {}'.format(self))
855 elif isinstance(exc, _SOCKET_ERRORS):
857 LOG.error('{}: {}'.format(exc, self))
859 elif (hasattr(exc, 'reason') and
860 isinstance(exc.reason, _urllib_error.URLError)):
861 if isinstance(exc.reason, _SOCKET_ERRORS):
862 reason = exc.reason.args[1]
865 LOG.error('{}: {}'.format(exc, self))
867 elif isinstance(exc, _feedparser.zlib.error):
868 LOG.error('broken compression: {}'.format(self))
870 elif isinstance(exc, (IOError, AttributeError)):
871 LOG.error('{}: {}'.format(exc, self))
873 elif isinstance(exc, KeyboardInterrupt):
875 elif isinstance(exc, _sax.SAXParseException):
876 LOG.error('sax parsing error: {}: {}'.format(exc, self))
878 elif parsed.bozo or exc:
880 exc = "can't process"
881 LOG.error('processing error: {}: {}'.format(exc, self))
885 status in [200, 302] and
886 not parsed.entries and
888 raise ProcessingError(parsed=parsed, feed=feed)
890 def _process_entry(self, parsed, entry):
891 id_ = self._get_entry_id(entry)
892 # If .trust_guid isn't set, we get back hashes of the content.
893 # Instead of letting these run wild, we put them in context
894 # by associating them with the actual ID (if it exists).
895 guid = entry['id'] or id_
896 if isinstance(guid, dict):
897 guid = guid.values()[0]
898 if guid in self.seen:
899 if self.seen[guid] == id_:
900 LOG.debug('already seen {}'.format(id_))
901 return # already seen
902 sender = self._get_entry_email(parsed=parsed, entry=entry)
903 link = entry.get('link', None)
904 subject = self._get_entry_title(entry)
905 extra_headers = _collections.OrderedDict((
906 ('Date', self._get_entry_date(entry)),
907 ('User-Agent', 'rss2email'),
908 ('X-RSS-Feed', self.url),
911 ('X-RSS-TAGS', self._get_entry_tags(entry)),
913 for k,v in extra_headers.items(): # remove empty tags, etc.
916 if self.bonus_header:
917 for header in self.bonus_header.splitlines():
919 key,value = header.split(':', 1)
920 extra_headers[key.strip()] = value.strip()
923 'malformed bonus-header: {}'.format(
926 content = self._get_entry_content(entry)
927 content = self._process_entry_content(
928 entry=entry, content=content, link=link, subject=subject)
929 message = get_message(
933 body=content['value'],
934 content_type=content['type'].split('/', 1)[1],
935 extra_headers=extra_headers)
936 return (guid, id_, sender, message)
938 def _get_entry_id(self, entry):
939 """Get best ID from an entry."""
941 if getattr(entry, 'id', None):
942 # Newer versions of feedparser could return a dictionary
943 if isinstance(entry.id, dict):
944 return entry.id.values()[0]
946 content_type,content_value = self._get_entry_content(entry)
947 content_value = content_value.strip()
949 return hash(content_value.encode('unicode-escape')).hexdigest()
950 elif getattr(entry, 'link', None):
951 return hash(entry.link.encode('unicode-escape')).hexdigest()
952 elif getattr(entry, 'title', None):
953 return hash(entry.title.encode('unicode-escape')).hexdigest()
955 def _get_entry_title(self, entry):
956 if hasattr(entry, 'title_detail') and entry.title_detail:
957 title = entry.title_detail.value
958 if 'html' in entry.title_detail.type:
959 title = _html2text.html2text(title)
961 title = self._get_entry_content(entry).content[:70]
962 title = title.replace('\n', ' ').strip()
965 def _get_entry_date(self, entry):
966 datetime = _time.gmtime()
968 for datetype in self.date_header_order:
969 kind = datetype + '_parsed'
970 if entry.get(kind, None):
971 datetime = entry[kind]
973 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
975 def _get_entry_name(self, parsed, entry):
977 if not self.friendly_name:
981 parts.append(feed.get('title', ''))
982 for x in [entry, feed]:
983 if 'name' in x.get('author_detail', []):
984 if x.author_detail.name:
987 parts.append(x.author_detail.name)
989 if not ''.join(parts) and self.use_publisher_email:
990 if 'name' in feed.get('publisher_detail', []):
993 parts.append(feed.publisher_detail.name)
994 return _html2text.unescape(''.join(parts))
996 def _validate_email(email, default=None):
997 """Do a basic quality check on email address
999 Return `default` if the address doesn't appear to be
1000 well-formed. If `default` is `None`, return
1003 parts = email.split('@')
1006 return self.from_email
1010 def _get_entry_address(self, parsed, entry):
1011 """Get the best From email address ('<jdoe@a.com>')
1013 If the best guess isn't well-formed (something@something.com),
1014 use `self.from_email` instead.
1017 return self.from_email
1019 if 'email' in entry.get('author_detail', []):
1020 return self._validate_email(entry.author_detail.email)
1021 elif 'email' in feed.get('author_detail', []):
1022 return self._validate_email(feed.author_detail.email)
1023 if self.use_publisher_email:
1024 if 'email' in feed.get('publisher_detail', []):
1025 return self._validate_email(feed.publisher_detail.email)
1026 if feed.get('errorreportsto', None):
1027 return self._validate_email(feed.errorreportsto)
1028 LOG.debug('no sender address found, fallback to default')
1029 return self.from_email
def _get_entry_email(self, parsed, entry):
    """Build the full From value for an entry ('John <jdoe@a.com>')

    The display name comes from `_get_entry_name` and the address
    from `_get_entry_address`; the two are joined with `formataddr`.
    """
    return _formataddr((
        self._get_entry_name(parsed=parsed, entry=entry),
        self._get_entry_address(parsed=parsed, entry=entry)))
1038 def _get_entry_tags(self, entry):
1039 "Add post tags, if available"
1040 taglist = [tag['term'] for tag in entry.get('tags', [])]
1042 return ','.join(taglist)
1044 def _get_entry_content(self, entry):
1045 """Select the best content from an entry.
1047 Returns a feedparser content dict.
1050 # * We have a bunch of potential contents.
1051 # * We go thru looking for our first choice.
1052 # (HTML or text, depending on self.html_mail)
1053 # * If that doesn't work, we go thru looking for our second choice.
1054 # * If that still doesn't work, we just take the first one.
1056 # Possible future improvement:
1057 # * Instead of just taking the first one
1058 # pick the one in the "best" language.
1059 # * HACK: hardcoded .html_mail, should take a tuple of media types
1060 contents = list(entry.get('content', []))
1061 if entry.get('summary_detail', None):
1062 contents.append(entry.summary_detail)
1064 types = ['text/html', 'text/plain']
1066 types = ['text/plain', 'text/html']
1067 for content_type in types:
1068 for content in contents:
1069 if content['type'] == content_type:
1073 return {type: 'text/plain', 'value': ''}
1075 def _process_entry_content(self, entry, content, link, subject):
1076 "Convert entry content to the requested format."
1083 if self.use_css and self.css:
1085 ' <style type="text/css">',
1093 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
1095 '<div id="body"><table><tr><td>',
1097 if content['type'] in ('text/html', 'application/xhtml+xml'):
1098 lines.append(content['value'].strip())
1100 lines.append(_saxutils.escape(content['value'].strip()))
1101 lines.append('</td></tr></table></div>')
1103 '<div class="footer">'
1104 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
1106 for enclosure in getattr(entry, 'enclosures', []):
1107 if getattr(enclosure, 'url', None):
1109 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
1111 if getattr(enclosure, 'src', None):
1113 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
1116 '<p><img src="{}" /></p>'.format(enclosure.src))
1117 for elink in getattr(entry, 'links', []):
1118 if elink.get('rel', None) == 'via':
1121 'http://www.google.com/reader/public/atom/',
1122 'http://www.google.com/reader/view/')
1124 if elink.get('title', None):
1125 title = elink['title']
1126 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
1134 content['type'] = 'text/html'
1135 content['value'] = '\n'.join(lines)
1137 else: # not self.html_mail
1138 if content['type'] in ('text/html', 'application/xhtml+xml'):
1139 lines = [_html2text.html2text(content['value'])]
1141 lines = [content['value']]
1143 lines.append('URL: {}'.format(link))
1144 for enclosure in getattr(entry, 'enclosures', []):
1145 if getattr(enclosure, 'url', None):
1146 lines.append('Enclosure: {}'.format(enclosure.url))
1147 if getattr(enclosure, 'src', None):
1148 lines.append('Enclosure: {}'.format(enclosure.src))
1149 for elink in getattr(entry, 'links', []):
1150 if elink.get('rel', None) == 'via':
1153 'http://www.google.com/reader/public/atom/',
1154 'http://www.google.com/reader/view/')
1156 if elink.get('title', None):
1157 title = elink['title']
1158 lines.append('Via: {} {}'.format(title, url))
1159 content['type'] = 'text/plain'
1160 content['value'] = '\n'.join(lines)
1163 def _send(self, sender, message):
1164 LOG.info('send message for {}'.format(self))
1165 section = self.section
1166 if section not in self.config:
1168 send(sender=sender, recipient=self.to, message=message,
1169 config=self.config, section=section)
# Fetch the feed, build one message per entry yielded by `_process`, and
# record each yielded entry in `self.seen`.  The `send` parameter is the
# on/off switch for mailing.
# NOTE(review): the guard that consults `send`, the condition in front of the
# `raise`, and parts of the docstring are not visible in this extract.
1171 def run(self, send=True):
1172 """Fetch and process the feed, mailing entry emails.
1175 ... name='test-feed',
1176 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
1177 >>> def send(sender, message):
1178 ... print('send from {}:'.format(sender))
1179 ... print(message.as_string())
1180 >>> feed._send = send
1181 >>> feed.to = 'jdoe@dummy.invalid'
1182 >>> #parsed = feed.run() # enable for debugging
# Presumably raised when no recipient is configured for this feed -- the
# guarding condition is not visible here; confirm.
1185 raise NoToEmailAddress(feed=self)
1186 parsed = self._fetch()
# `_process` yields (guid, id_, sender, message) tuples.
1187 for (guid, id_, sender, message) in self._process(parsed):
1188 LOG.debug('new message: {}'.format(message['Subject']))
1190 self._send(sender=sender, message=message)
# Remember the entry so it is recognised on later runs.
1191 self.seen[guid] = id_
# Keep the values `parsed` reports under 'etag' and 'modified' (None if absent).
1192 self.etag = parsed.get('etag', None)
1193 self.modified = parsed.get('modified', None)
1197 """Utility class for rss2email activity.
1202 Setup a temporary directory to load.
1204 >>> tmpdir = tempfile.TemporaryDirectory(prefix='rss2email-test-')
1205 >>> configfile = _os.path.join(tmpdir.name, 'config')
1206 >>> with open(configfile, 'w') as f:
1207 ... count = f.write('[DEFAULT]\\n')
1208 ... count = f.write('to = a@b.com\\n')
1209 ... count = f.write('[feed.f1]\\n')
1210 ... count = f.write('url = http://a.net/feed.atom\\n')
1211 ... count = f.write('to = x@y.net\\n')
1212 ... count = f.write('[feed.f2]\\n')
1213 ... count = f.write('url = http://b.com/rss.atom\\n')
1214 >>> datafile = _os.path.join(tmpdir.name, 'feeds.dat')
1215 >>> with open(datafile, 'wb') as f:
1217 ... Feed(name='f1'),
1218 ... Feed(name='f2'),
1221 >>> feeds = Feeds(configdir=tmpdir.name)
1223 >>> for feed in feeds:
1225 f1 (http://a.net/feed.atom -> x@y.net)
1226 f2 (http://b.com/rss.atom -> a@b.com)
1228 You can index feeds by array index or by feed name.
1231 <Feed f1 (http://a.net/feed.atom -> x@y.net)>
1233 <Feed f2 (http://b.com/rss.atom -> a@b.com)>
1235 <Feed f1 (http://a.net/feed.atom -> x@y.net)>
1236 >>> feeds['missing']
1237 Traceback (most recent call last):
1241 Tweak the feed configuration and save.
1243 >>> feeds[0].to = None
1245 >>> print(open(configfile, 'r').read().rstrip('\\n'))
1246 ... # doctest: +REPORT_UDIFF, +ELLIPSIS
1248 from = bozo@dev.null.invalid
1253 url = http://a.net/feed.atom
1256 url = http://b.com/rss.atom
1258 Cleanup the temporary directory.
1260 >>> tmpdir.cleanup()
# Set up file locations and shared config for the feed collection.
#   configdir   -- directory used to derive the defaults below; falls back to
#                  ~/.config/rss2email (user-expanded).
#   datafile    -- feed data path; defaults to <configdir>/feeds.dat.
#   configfiles -- list of config paths; defaults to [<configdir>/config].
# NOTE(review): the continuation of the signature and the lines that choose
# `config` before it is stored are not visible in this extract.
1262 def __init__(self, configdir=None, datafile=None, configfiles=None,
1264 super(Feeds, self).__init__()
1265 if configdir is None:
1266 configdir = _os.path.expanduser(_os.path.join(
1267 '~', '.config', 'rss2email'))
1268 if datafile is None:
1269 datafile = _os.path.join(configdir, 'feeds.dat')
1270 self.datafile = datafile
1271 if configfiles is None:
1272 configfiles = [_os.path.join(configdir, 'config')]
1273 self.configfiles = configfiles
1276 self.config = config
# No data-file handle is held until feeds are loaded.
1277 self._datafile_lock = None
# Look a feed up by name (a feed whose `name` equals `key`) or, failing that,
# by list position via the parent class.  A ValueError from the hidden
# conversion step is re-raised as IndexError(key), chained to the original.
# NOTE(review): the loop header, the name-match return, and the statement that
# produces `index` are not visible in this extract.
1279 def __getitem__(self, key):
1281 if feed.name == key:
1285 except ValueError as e:
1286 raise IndexError(key) from e
1287 return super(Feeds, self).__getitem__(index)
def __append__(self, feed):
    """Load `feed`'s settings from the shared config, then append it.

    NOTE: Python never calls ``__append__`` implicitly (``list`` has no
    such hook), so a plain ``feeds.append(feed)`` does not go through
    this method; it only runs when invoked by this exact name.

    :param feed: object exposing ``load_from_config(config)``.
    :returns: ``None``, like ``list.append``.
    """
    feed.load_from_config(self.config)
    # list.append() returns None, so its result is deliberately not bound
    # (the old code overwrote `feed` with None for no effect).
    super(Feeds, self).append(feed)
def __pop__(self, index=-1):
    """Remove and return the feed at `index`, dropping its config section.

    NOTE: Python never calls ``__pop__`` implicitly; it only runs when
    invoked by this exact name.

    :param index: list position of the feed to remove (default: last).
    :returns: the removed feed.
    :raises IndexError: if the list is empty or `index` is out of range.
    """
    # list.pop() takes its index positionally only; the previous
    # ``pop(index=index)`` raised TypeError on every call.
    feed = super(Feeds, self).pop(index)
    if feed.section in self.config:
        self.config.pop(feed.section)
    return feed
# Resolve `index` to a feed.  The visible branches distinguish int and str
# arguments, recurse via `self.index(index)`, and compare `feed.name` with
# `index`; anything that falls through reaches the parent `list.index`.
# NOTE(review): most of this body is not visible in this extract.
1299 def index(self, index):
1300 if isinstance(index, int):
1302 elif isinstance(index, str):
1308 return self.index(index)
1310 if feed.name == index:
# NOTE(review): the result of this call is not returned in the visible code,
# so a successful parent lookup would yield None to the caller -- confirm
# whether the call is only meant to raise ValueError for unknown values.
1312 super(Feeds, self).index(index)
def remove(self, feed):
    """Delete `feed` from the list and forget its config section, if any.

    Raises whatever ``list.remove`` raises when `feed` is not present.
    """
    super(Feeds, self).remove(feed)
    if feed.section not in self.config:
        # Nothing was ever written to the config for this feed.
        return
    self.config.pop(feed.section)
def load(self, lock=True, require=False):
    """Read the configuration files, then load the stored feed data.

    `lock` and `require` are forwarded unchanged to ``_load_feeds``.
    The result of ``self.config.read(...)`` is kept in
    ``self.read_configfiles``; it is an empty list when no config paths
    are configured.
    """
    paths = self.configfiles
    LOG.debug('load feed configuration from {}'.format(paths))
    # Skip the read entirely when there is nothing to read.
    self.read_configfiles = self.config.read(paths) if paths else []
    LOG.debug('loaded confguration from {}'.format(self.read_configfiles))
    self._load_feeds(lock=lock, require=require)
# Load the pickled feed list from `self.datafile`, apply the current config
# to feeds, and append a fresh Feed for every `feed.<name>` config section
# that has no matching loaded feed.
# NOTE(review): many control-flow lines (the conditions that use `lock` and
# `require`, try/else/finally headers, loop headers) are not visible in this
# extract, so the exact branch structure cannot be stated from here.
1332 def _load_feeds(self, lock, require):
1333 LOG.debug('load feed data from {}'.format(self.datafile))
1334 if not _os.path.exists(self.datafile):
1336 raise NoDataFile(feeds=self)
1337 LOG.info('feed data file not found at {}'.format(self.datafile))
1338 LOG.debug('creating an empty data file')
1339 with open(self.datafile, 'wb') as f:
# The read handle is stored on the instance: it is the object that gets
# flock()ed below and closed later to release the lock.
1342 self._datafile_lock = open(self.datafile, 'rb')
1343 except IOError as e:
1344 raise DataFileError(feeds=self) from e
# Exclusive advisory lock on the data file.
1348 locktype = _fcntl.LOCK_EX
1349 _fcntl.flock(self._datafile_lock.fileno(), locktype)
# LOG.handlers is snapshotted before unpickling and restored afterwards --
# presumably because unpickling can disturb the logger's handlers; confirm.
1354 handlers = list(LOG.handlers)
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe while the data file is trusted.
1355 feeds = list(_pickle.load(self._datafile_lock))
1357 LOG.handlers = handlers
1361 self._datafile_lock.close()
1362 self._datafile_lock = None
1365 feed.load_from_config(self.config)
# Pick up feeds that exist only in the config: sections are named
# 'feed.<name>', and names already loaded are skipped.
1367 feed_names = set(feed.name for feed in self)
1368 for section in self.config.sections():
1369 if section.startswith('feed.'):
1370 name = section[len('feed.'):]
1371 if name not in feed_names:
1373 ('feed {} not found in feed file, '
1374 'initializing from config').format(name))
1375 self.append(Feed(name=name, config=self.config))
1376 feed_names.add(name)
# Body of the method that writes the configuration back to disk.
# NOTE(review): its `def` line, the loop header in front of
# `feed.save_to_config()`, and anything after the write are not visible in
# this extract.
# Only the LAST path in `self.configfiles` is written; its parent directory
# is created first when missing.
1379 LOG.debug('save feed configuration to {}'.format(self.configfiles[-1]))
1381 feed.save_to_config()
1382 dirname = _os.path.dirname(self.configfiles[-1])
1383 if dirname and not _os.path.isdir(dirname):
1384 _os.makedirs(dirname)
1385 with open(self.configfiles[-1], 'w') as f:
1386 self.config.write(f)
# Pickle the current feed list to `self.datafile`, creating the parent
# directory when missing.
# NOTE(review): the lines that choose between the two write strategies below
# are not visible in this extract.
1389 def _save_feeds(self):
1390 LOG.debug('save feed data to {}'.format(self.datafile))
1391 dirname = _os.path.dirname(self.datafile)
1392 if dirname and not _os.path.isdir(dirname):
1393 _os.makedirs(dirname)
# Strategy 1: write to '<datafile>.tmp' and rename over the real file, so the
# data file is replaced in one step; then close the held handle.
1395 tmpfile = self.datafile + '.tmp'
1396 with open(tmpfile, 'wb') as f:
1397 _pickle.dump(list(self), f)
1398 _os.rename(tmpfile, self.datafile)
1399 if self._datafile_lock is not None:
1400 self._datafile_lock.close() # release the lock
1401 self._datafile_lock = None
# Strategy 2: write the data file directly.
# NOTE(review): the file object opened here is never closed explicitly;
# a `with` block would guarantee the data is flushed.
1403 _pickle.dump(list(self), open(self.datafile, 'wb'))
# Create a Feed from `name` and the extra keyword arguments.  When a name has
# to be generated, candidates are built as '<prefix><i>' and checked against
# the names already in this collection (the doctest shows feed-0, feed-1).
# NOTE(review): the loop that drives `i`, the steps after the Feed is
# constructed, and the return are not visible in this extract.
1405 def new_feed(self, name=None, prefix='feed-', **kwargs):
1406 """Return a new feed, possibly auto-generating a name.
1409 >>> print(feeds.new_feed(name='my-feed'))
1410 my-feed (None -> a@b.com)
1411 >>> print(feeds.new_feed())
1412 feed-0 (None -> a@b.com)
1413 >>> print(feeds.new_feed())
1414 feed-1 (None -> a@b.com)
1419 name = '{}{}'.format(prefix, i)
1420 feed_names = [feed.name for feed in self]
1421 if name not in feed_names:
1424 feed = Feed(name=name, **kwargs)
1429 ### Program Functions ###
# `new` command: store `args.email` as the default recipient under
# feeds.config['DEFAULT']['to'].
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): any guard on `args.email` and any save step are not visible
# in this extract.
1431 def cmd_new(feeds, args):
1432 "Create a new feed database."
1434 LOG.info('set the default target email to {}'.format(args.email))
1435 feeds.config['DEFAULT']['to'] = args.email
# `email` command: change (or, per the first log message, unset) the default
# recipient stored under feeds.config['DEFAULT']['to'].
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the condition selecting between the two log messages and any
# save step are not visible in this extract.
1438 def cmd_email(feeds, args):
1439 "Update the default target email address"
1441 LOG.info('unset the default target email')
1443 LOG.info('set the default target email to {}'.format(args.email))
1444 feeds.config['DEFAULT']['to'] = args.email
# `add` command: create a feed from args.name / args.url / args.email via
# `feeds.new_feed`.
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the condition in front of the `raise` (presumably "the new
# feed has no recipient") and any save step are not visible in this extract.
1447 def cmd_add(feeds, args):
1448 "Add a new feed to the database"
1449 feed = feeds.new_feed(name=args.name, url=args.url, to=args.email)
1450 LOG.info('add new feed {}'.format(feed))
1452 raise NoToEmailAddress(feeds=feeds)
# `run` command: call `feed.run(send=args.send)` for every selected feed.
# `args.index` is replaced by every position in `feeds` on the visible
# assignment below (its guard is not visible; presumably "no feeds named").
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the bodies of both `except` clauses and any per-feed guard
# are not visible in this extract.
1455 def cmd_run(feeds, args):
1456 "Fetch feeds and send entry emails."
1458 args.index = range(len(feeds))
1459 for index in args.index:
1460 feed = feeds.index(index)
1463 feed.run(send=args.send)
1464 except NoToEmailAddress as e:
1466 except ProcessingError as e:
# `list` command: print one line per feed as '<position>: [<flag>] <feed>'.
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the lines that compute `active_char` are not visible in this
# extract.
1470 def cmd_list(feeds, args):
1471 "List all the feeds in the database"
1472 for i,feed in enumerate(feeds):
1477 print('{}: [{}] {}'.format(i, active_char, feed))
# Set `feed.active` to `active` on every selected feed, logging each change.
# `args.index` is replaced by every position in `feeds` on the visible
# assignment below (its guard is not visible; presumably "no feeds named").
# NOTE(review): the lines that define `action` (the verb used in the log
# message) and any save step are not visible in this extract.
1479 def _cmd_set_active(feeds, args, active=True):
1480 "Shared by `cmd_pause` and `cmd_unpause`."
1486 args.index = range(len(feeds))
1487 for index in args.index:
1488 feed = feeds.index(index)
1489 LOG.info('{} feed {}'.format(action, feed))
1490 feed.active = active
def cmd_pause(feeds, args):
    "Pause a feed (disable fetching)"
    # The docstring above doubles as the `pause` sub-command's help text
    # (its first line is read via __doc__), so it must stay as-is.
    # Pausing is simply "set active to False" on the selected feeds.
    _cmd_set_active(feeds, args, False)
def cmd_unpause(feeds, args):
    "Unpause a feed (enable fetching)"
    # The docstring above doubles as the `unpause` sub-command's help text
    # (its first line is read via __doc__), so it must stay as-is.
    # Unpausing is simply "set active to True" on the selected feeds.
    _cmd_set_active(feeds, args, True)
# `delete` command: resolve every requested index to a feed first, collecting
# them in `to_remove`, and only then process the collected feeds -- resolving
# up front keeps positions valid while the list shrinks.
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the initialisation of `to_remove`, the actual removal call,
# and any save step are not visible in this extract.
1501 def cmd_delete(feeds, args):
1502 "Remove a feed from the database"
1504 for index in args.index:
1505 feed = feeds.index(index)
1506 to_remove.append(feed)
1507 for feed in to_remove:
1508 LOG.info('deleting feed {}'.format(feed))
# `reset` command: visit every selected feed and log that it is being reset.
# `args.index` is replaced by every position in `feeds` on the visible
# assignment below (its guard is not visible; presumably "no feeds named").
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the call that actually clears the feed's state and any save
# step are not visible in this extract.
1512 def cmd_reset(feeds, args):
1513 "Forget dynamic feed data (e.g. to re-send old entries)"
1515 args.index = range(len(feeds))
1516 for index in args.index:
1517 feed = feeds.index(index)
1518 LOG.info('resetting feed {}'.format(feed))
# `opmlimport` command: parse an OPML document (from `args.file`, or stdin per
# the second log message) and create one feed per <outline> element that
# carries an `xmlUrl` attribute.
# The one-line docstring is reused as CLI help text by `run`; keep it intact.
# NOTE(review): the if/else choosing the input, the `try:` header, the close
# of the input file, and any save step are not visible in this extract.
1522 def cmd_opmlimport(feeds, args):
1523 "Import configuration from OPML."
1525 LOG.info('importing feeds from {}'.format(args.file))
1526 f = open(args.file, 'rb')
1528 LOG.info('importing feeds from stdin')
1531 dom = _minidom.parse(f)
1532 new_feeds = dom.getElementsByTagName('outline')
# Any parse failure is reported as OPMLReadError, chained to the original.
1533 except Exception as e:
1534 raise OPMLReadError() from e
1537 for feed in new_feeds:
1538 if feed.hasAttribute('xmlUrl'):
# NOTE(review): an XML parser normally hands back attribute values with
# entities already resolved, so this extra unescape may decode twice --
# confirm with a URL containing a literal '&amp;'.
1539 url = _saxutils.unescape(feed.getAttribute('xmlUrl'))
# `feed` is rebound here from the DOM element to the newly created feed.
1540 feed = feeds.new_feed(url=url)
1541 LOG.info('add new feed {}'.format(feed))
def cmd_opmlexport(feeds, args):
    "Export configuration to OPML."
    # The one-line docstring above is reused as CLI help text; keep it intact.
    #
    # Writes an OPML 1.0 document with one <outline> per feed, to
    # `args.file` when given and to stdout otherwise.
    #
    # Fixes relative to the previous version:
    #   * the output file was opened with mode 'rb' (read-only, binary) and
    #     then written with str data, so exporting to a file always failed;
    #     it is now opened for text writing, as UTF-8 to match the XML
    #     declaration;
    #   * the file is closed deterministically through `with`;
    #   * double quotes in a URL are escaped too, because the value is
    #     placed inside double-quoted XML attributes.
    import sys as _sys  # local import keeps this block self-contained

    def _write_opml(stream):
        # Emit header, one outline per feed, then the closing tags.
        stream.write(
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<opml version="1.0">\n'
            '<head>\n'
            '<title>rss2email OPML export</title>\n'
            '</head>\n'
            '<body>\n')
        for feed in feeds:
            url = _saxutils.escape(feed.url, {'"': '&quot;'})
            stream.write(
                '<outline type="rss" text="{0}" xmlUrl="{0}"/>'.format(url))
        stream.write(
            '</body>\n'
            '</opml>\n')

    if args.file:
        LOG.info('exporting feeds to {}'.format(args.file))
        with open(args.file, 'w', encoding='utf-8') as f:
            _write_opml(f)
    else:
        LOG.info('exporting feeds to stdout')
        _write_opml(_sys.stdout)
1569 ### Main Program ###
# Command-line entry point.  Builds the argument parser (global options plus
# one sub-parser per command, each bound to its `cmd_*` function through
# `set_defaults(func=...)`), parses the arguments, constructs `Feeds`, loads
# it unless the command is `new`, and dispatches to the selected command.
# Sub-command help strings are taken from the first docstring line of the
# matching `cmd_*` function.
# NOTE(review): several lines are not visible in this extract: the positional
# argument names in most `add_argument` calls, the `try:` header, the guard
# in front of `LOG.setLevel`, and the body of the final `except`.
1571 def run(*args, **kwargs):
1572 """The rss2email command line interface
1574 Arguments passed to this function are forwarded to the parser's
1575 `.parse_args()` call without modification.
# NOTE(review): the `version=` constructor keyword was removed from argparse
# in Python 3.3; on such interpreters this call raises TypeError.  The
# supported form is a separate argument with action='version' -- confirm
# against the target Python version.
1577 parser = _argparse.ArgumentParser(
1578 description=_MODULE_DOCSTRING, version=__version__)
# -c may be repeated; each use appends one more config path.
1580 parser.add_argument(
1581 '-c', '--config', metavar='PATH', default=[], action='append',
1582 help='path to the configuration file')
1583 parser.add_argument(
1584 '-d', '--data', metavar='PATH',
1585 help='path to the feed data file')
# -V may be repeated; the count is used below to lower the log threshold.
1586 parser.add_argument(
1587 '-V', '--verbose', default=0, action='count',
1588 help='increment verbosity')
1589 subparsers = parser.add_subparsers(title='commands')
1591 new_parser = subparsers.add_parser(
1592 'new', help=cmd_new.__doc__.splitlines()[0])
1593 new_parser.set_defaults(func=cmd_new)
1594 new_parser.add_argument(
1596 help='default target email for the new feed database')
1598 email_parser = subparsers.add_parser(
1599 'email', help=cmd_email.__doc__.splitlines()[0])
1600 email_parser.set_defaults(func=cmd_email)
1601 email_parser.add_argument(
1602 'email', default='',
1603 help='default target email for the email feed database')
1605 add_parser = subparsers.add_parser(
1606 'add', help=cmd_add.__doc__.splitlines()[0])
1607 add_parser.set_defaults(func=cmd_add)
1608 add_parser.add_argument(
1609 'name', help='name of the new feed')
1610 add_parser.add_argument(
1611 'url', help='location of the new feed')
1612 add_parser.add_argument(
1614 help='target email for the new feed')
1616 run_parser = subparsers.add_parser(
1617 'run', help=cmd_run.__doc__.splitlines()[0])
1618 run_parser.set_defaults(func=cmd_run)
# --no-send flips `args.send` from its default True to False.
1619 run_parser.add_argument(
1620 '-n', '--no-send', dest='send',
1621 default=True, action='store_const', const=False,
1622 help="fetch feeds, but don't send email")
1623 run_parser.add_argument(
1625 help='feeds to fetch (defaults to fetching all feeds)')
1627 list_parser = subparsers.add_parser(
1628 'list', help=cmd_list.__doc__.splitlines()[0])
1629 list_parser.set_defaults(func=cmd_list)
1631 pause_parser = subparsers.add_parser(
1632 'pause', help=cmd_pause.__doc__.splitlines()[0])
1633 pause_parser.set_defaults(func=cmd_pause)
1634 pause_parser.add_argument(
1636 help='feeds to pause (defaults to pausing all feeds)')
1638 unpause_parser = subparsers.add_parser(
1639 'unpause', help=cmd_unpause.__doc__.splitlines()[0])
1640 unpause_parser.set_defaults(func=cmd_unpause)
# NOTE(review): the help text of the next argument reads 'ununpause'; it
# looks like a typo for 'unpause'.
1641 unpause_parser.add_argument(
1643 help='feeds to ununpause (defaults to unpausing all feeds)')
1645 delete_parser = subparsers.add_parser(
1646 'delete', help=cmd_delete.__doc__.splitlines()[0])
1647 delete_parser.set_defaults(func=cmd_delete)
1648 delete_parser.add_argument(
1650 help='feeds to delete')
1652 reset_parser = subparsers.add_parser(
1653 'reset', help=cmd_reset.__doc__.splitlines()[0])
1654 reset_parser.set_defaults(func=cmd_reset)
1655 reset_parser.add_argument(
1657 help='feeds to reset (defaults to resetting all feeds)')
1659 opmlimport_parser = subparsers.add_parser(
1660 'opmlimport', help=cmd_opmlimport.__doc__.splitlines()[0])
1661 opmlimport_parser.set_defaults(func=cmd_opmlimport)
1662 opmlimport_parser.add_argument(
1663 'file', metavar='PATH', nargs='?',
1664 help='path for imported OPML (defaults to stdin)')
1666 opmlexport_parser = subparsers.add_parser(
1667 'opmlexport', help=cmd_opmlexport.__doc__.splitlines()[0])
1668 opmlexport_parser.set_defaults(func=cmd_opmlexport)
1669 opmlexport_parser.add_argument(
1670 'file', metavar='PATH', nargs='?',
1671 help='path for exported OPML (defaults to stdout)')
1673 args = parser.parse_args(*args, **kwargs)
# Each -V lowers the threshold by 10 starting from ERROR, never below DEBUG.
1676 LOG.setLevel(max(_logging.DEBUG, _logging.ERROR - 10 * args.verbose))
1681 feeds = Feeds(datafile=args.data, configfiles=args.config)
# `new` starts from scratch, so nothing is loaded for it.
1682 if args.func != cmd_new:
# `list` and `opmlexport` load without taking the data-file lock --
# presumably because they only read; confirm.
1683 lock = args.func not in [cmd_list, cmd_opmlexport]
1684 feeds.load(lock=lock)
1685 args.func(feeds=feeds, args=args)
1686 except RSS2EmailError as e:
1691 if __name__ == '__main__':