-# Copyright (C) 2004-2012 Aaron Swartz
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2013 Aaron Swartz
# Brian Lalor
# Dean Jackson
+# Dennis Keitzel <github@pinshot.net>
# Erik Hetzner
+# Etienne Millon <me@emillon.org>
+# J. Lewis Muir <jlmuir@imca-cat.org>
# Joey Hess
# Lindsey Smith <lindsey.smith@gmail.com>
# Marcel Ackermann
"""
import collections as _collections
+from email.mime.message import MIMEMessage as _MIMEMessage
+from email.mime.multipart import MIMEMultipart as _MIMEMultipart
from email.utils import formataddr as _formataddr
+import hashlib as _hashlib
+import html.parser as _html_parser
import re as _re
import socket as _socket
import time as _time
from . import util as _util
-_feedparser.USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
+_USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
+_feedparser.USER_AGENT = _USER_AGENT
_urllib_request.install_opener(_urllib_request.build_opener())
_SOCKET_ERRORS = []
-for e in ['error', 'gaierror']:
+for e in ['error', 'herror', 'gaierror']:
if hasattr(_socket, e):
_SOCKET_ERRORS.append(getattr(_socket, e))
+del e # cleanup namespace
_SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
+# drv_libxml2 raises:
+# TypeError: 'str' does not support the buffer interface
+_feedparser.PREFERRED_XML_PARSERS = []
+
class Feed (object):
"""Utility class for feed manipulation and storage.
>>> feed.section
'feed.test-feed'
>>> feed.from_email
- 'bozo@dev.null.invalid'
+ 'user@rss2email.invalid'
>>> feed.from_email = 'a@b.com'
>>> feed.save_to_config()
>>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
[DEFAULT]
- from = bozo@dev.null.invalid
+ from = user@rss2email.invalid
...
verbose = warning
<BLANKLINE>
>>> feed.url
'http://example.com/feed.atom'
- Names can only contain ASCII letters, digits, and '._-'. Here the
+ Names can only contain letters, digits, and '._-'. Here the
invalid space causes an exception:
>>> Feed(name='invalid name')
...
rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
+ However, you aren't restricted to ASCII letters:
+
+ >>> Feed(name='Αθήνα')
+ <Feed Αθήνα (None -> )>
+
+ You must define a URL:
+
+ >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
+ Traceback (most recent call last):
+ ...
+ rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
+
+
Cleanup `CONFIG`.
>>> CONFIG['DEFAULT']['to'] = ''
>>> test_section = CONFIG.pop('feed.test-feed')
+
"""
- _name_regexp = _re.compile('^[a-zA-Z0-9._-]+$')
+    _name_regexp = _re.compile(r'^[\w\d.-]+$')
# saved/loaded from feed.dat using __getstate__/__setstate__.
_dynamic_attributes = [
# hints for value conversion
_boolean_attributes = [
+ 'digest',
'force_from',
'use_publisher_email',
- 'friendly_name',
'active',
'date_header',
'trust_guid',
'encodings',
]
+ _function_attributes = [
+ 'post_process',
+ 'digest_post_process',
+ ]
+
def __init__(self, name=None, url=None, to=None, config=None):
self._set_name(name=name)
self.reset()
return dict(
(key,getattr(self,key)) for key in self._dynamic_attributes)
+ get_state = __getstate__ # make it publicly accessible
+
def __setstate__(self, state):
"Restore dynamic attributes"
keys = sorted(state.keys())
self._set_name(name=state['name'])
self.__dict__.update(state)
+ set_state = __setstate__ # make it publicly accessible
+
def save_to_config(self):
"Save configured attributes"
data = _collections.OrderedDict()
for key in expected:
if (key not in keys and
key not in self._non_default_configured_attributes):
- raise ValueError('missing key: {}'.format(key))
+ raise _error.InvalidFeedConfig(
+ setting=key, feed=self,
+ message='missing configuration key: {}'.format(key))
for key in keys:
if key not in expected:
- raise ValueError('extra key: {}'.format(key))
+ raise _error.InvalidFeedConfig(
+ setting=key, feed=self,
+ message='extra configuration key: {}'.format(key))
data = dict(
(self._configured_attribute_inverse_translations[k],
self._get_configured_attribute_value(
self.__dict__.update(data)
def _get_configured_option_value(self, attribute, value):
- if value and attribute in self._list_attributes:
+ if value is None:
+ return ''
+ elif attribute in self._list_attributes:
return ', '.join(value)
+ elif attribute in self._function_attributes:
+ return _util.import_name(value)
return str(value)
def _get_configured_attribute_value(self, attribute, key, data):
return data.getint(key)
elif attribute in self._list_attributes:
return [x.strip() for x in data[key].split(',')]
+ elif attribute in self._function_attributes:
+ if data[key]:
+ return _util.import_function(data[key])
+ return None
return data[key]
def reset(self):
200
"""
_LOG.info('fetch {}'.format(self))
+ if not self.url:
+ raise _error.InvalidFeedConfig(setting='url', feed=self)
if self.section in self.config:
config = self.config[self.section]
else:
_LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
processed = self._process_entry(parsed=parsed, entry=entry)
if processed:
- yield processed
+ guid,id_,sender,message = processed
+ if self.post_process:
+ message = self.post_process(
+ feed=self, parsed=parsed, entry=entry, guid=guid,
+ message=message)
+ if not message:
+ continue
+ yield (guid, id_, sender, message)
def _check_for_errors(self, parsed):
warned = False
if isinstance(exc, _socket.timeout):
_LOG.error('timed out: {}'.format(self))
warned = True
- elif isinstance(exc, _SOCKET_ERRORS):
- reason = exc.args[1]
+ elif isinstance(exc, OSError):
_LOG.error('{}: {}'.format(exc, self))
warned = True
- elif (hasattr(exc, 'reason') and
- isinstance(exc.reason, _urllib_error.URLError)):
- if isinstance(exc.reason, _SOCKET_ERRORS):
- reason = exc.reason.args[1]
- else:
- reason = exc.reason
+ elif isinstance(exc, _SOCKET_ERRORS):
_LOG.error('{}: {}'.format(exc, self))
warned = True
elif isinstance(exc, _feedparser.zlib.error):
elif isinstance(exc, _sax.SAXParseException):
_LOG.error('sax parsing error: {}: {}'.format(exc, self))
warned = True
+ elif (parsed.bozo and
+ isinstance(exc, _feedparser.CharacterEncodingOverride)):
+ _LOG.warning(
+ 'incorrectly declared encoding: {}: {}'.format(exc, self))
+ warned = True
elif parsed.bozo or exc:
if exc is None:
exc = "can't process"
not version):
raise _error.ProcessingError(parsed=parsed, feed=feed)
+    def _html2text(self, html, baseurl='', default=None):
+        """Convert an HTML snippet to text, returning `default` (if given) on parse errors."""
+        # Apply this feed's html2text configuration before converting.
+        self.config.setup_html2text(section=self.section)
+        try:
+            return _html2text.html2text(html=html, baseurl=baseurl)
+        except _html_parser.HTMLParseError as e:
+            # NOTE(review): html.parser.HTMLParseError was removed in
+            # Python 3.5 -- confirm the supported interpreter range.
+            if default is not None:
+                return default
+            raise
+
def _process_entry(self, parsed, entry):
id_ = self._get_entry_id(entry)
# If .trust_guid isn't set, we get back hashes of the content.
# Instead of letting these run wild, we put them in context
# by associating them with the actual ID (if it exists).
- guid = entry['id'] or id_
+ guid = entry.get('id', id_)
if isinstance(guid, dict):
            guid = next(iter(guid.values()))
if guid in self.seen:
- if self.seen[guid] == id_:
+ if self.seen[guid]['id'] == id_:
_LOG.debug('already seen {}'.format(id_))
return # already seen
sender = self._get_entry_email(parsed=parsed, entry=entry)
- link = entry.get('link', None)
subject = self._get_entry_title(entry)
extra_headers = _collections.OrderedDict((
('Date', self._get_entry_date(entry)),
('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
- ('User-Agent', 'rss2email'),
+ ('User-Agent', _USER_AGENT),
('X-RSS-Feed', self.url),
('X-RSS-ID', id_),
- ('X-RSS-URL', link),
+ ('X-RSS-URL', self._get_entry_link(entry)),
('X-RSS-TAGS', self._get_entry_tags(entry)),
))
for k,v in extra_headers.items(): # remove empty tags, etc.
self.bonus_header))
content = self._get_entry_content(entry)
- content = self._process_entry_content(
- entry=entry, content=content, link=link, subject=subject)
+ try:
+ content = self._process_entry_content(
+ entry=entry, content=content, subject=subject)
+ except _error.ProcessingError as e:
+ e.parsed = parsed
+ raise
message = _email.get_message(
sender=sender,
recipient=self.to,
subject=subject,
body=content['value'],
content_type=content['type'].split('/', 1)[1],
- extra_headers=extra_headers)
+ extra_headers=extra_headers,
+ config=self.config,
+ section=self.section)
return (guid, id_, sender, message)
def _get_entry_id(self, entry):
if isinstance(entry.id, dict):
            return next(iter(entry.id.values()))
return entry.id
- content_type,content_value = self._get_entry_content(entry)
- content_value = content_value.strip()
+ content = self._get_entry_content(entry)
+ content_value = content['value'].strip()
if content_value:
- return hash(content_value.encode('unicode-escape')).hexdigest()
+ return _hashlib.sha1(
+ content_value.encode('unicode-escape')).hexdigest()
elif getattr(entry, 'link', None):
- return hash(entry.link.encode('unicode-escape')).hexdigest()
+ return _hashlib.sha1(
+ entry.link.encode('unicode-escape')).hexdigest()
elif getattr(entry, 'title', None):
- return hash(entry.title.encode('unicode-escape')).hexdigest()
+ return _hashlib.sha1(
+ entry.title.encode('unicode-escape')).hexdigest()
+
+    def _get_entry_link(self, entry):
+        """Return the entry's link, or None when the feed entry has no link."""
+        return entry.get('link', None)
def _get_entry_title(self, entry):
if hasattr(entry, 'title_detail') and entry.title_detail:
title = entry.title_detail.value
if 'html' in entry.title_detail.type:
- title = _html2text.html2text(title)
+ title = self._html2text(title, default=title)
else:
- title = self._get_entry_content(entry).content[:70]
+ content = self._get_entry_content(entry)
+ value = content['value']
+ if content['type'] in ('text/html', 'application/xhtml+xml'):
+ value = self._html2text(value, default=value)
+ title = value[:70]
title = title.replace('\n', ' ').strip()
return title
... '</feed>\\n'
... )
>>> entry = parsed.entries[0]
- >>> f.friendly_name = False
+ >>> f.name_format = ''
>>> f._get_entry_name(parsed, entry)
''
- >>> f.friendly_name = True
+ >>> f.name_format = '{author}'
>>> f._get_entry_name(parsed, entry)
'Example author'
+ >>> f.name_format = '{feed-title}: {author}'
+ >>> f._get_entry_name(parsed, entry)
+ ': Example author'
+ >>> f.name_format = '{author} ({feed.name})'
+ >>> f._get_entry_name(parsed, entry)
+ 'Example author (test-feed)'
"""
- if not self.friendly_name:
+ if not self.name_format:
return ''
- parts = ['']
+ data = {
+ 'feed': self,
+ 'feed-title': '<feed title>',
+ 'author': '<author>',
+ 'publisher': '<publisher>',
+ }
feed = parsed.feed
- parts.append(feed.get('title', ''))
+ data['feed-title'] = feed.get('title', '')
for x in [entry, feed]:
if 'name' in x.get('author_detail', []):
if x.author_detail.name:
- if ''.join(parts):
- parts.append(': ')
- parts.append(x.author_detail.name)
+ data['author'] = x.author_detail.name
break
- if not ''.join(parts) and self.use_publisher_email:
- if 'name' in feed.get('publisher_detail', []):
- if ''.join(parts):
- parts.append(': ')
- parts.append(feed.publisher_detail.name)
- return _html2text.unescape(''.join(parts))
+ if 'name' in feed.get('publisher_detail', []):
+ data['publisher'] = feed.publisher_detail.name
+ name = self.name_format.format(**data)
+ return _html2text.unescape(name)
def _validate_email(self, email, default=None):
"""Do a basic quality check on email address
return content
if contents:
return contents[0]
- return {type: 'text/plain', 'value': ''}
+ return {'type': 'text/plain', 'value': ''}
- def _process_entry_content(self, entry, content, link, subject):
+ def _process_entry_content(self, entry, content, subject):
"Convert entry content to the requested format."
+ link = self._get_entry_link(entry)
if self.html_mail:
lines = [
'<!DOCTYPE html>',
lines.extend([
'</head>',
'<body>',
- '<div id="entry>',
+ '<div id="entry">',
'<h1 class="header"><a href="{}">{}</a></h1>'.format(
link, subject),
- '<div id="body"><table><tr><td>',
+ '<div id="body">',
])
if content['type'] in ('text/html', 'application/xhtml+xml'):
lines.append(content['value'].strip())
else:
lines.append(_saxutils.escape(content['value'].strip()))
- lines.append('</td></tr></table></div>')
+ lines.append('</div>')
lines.extend([
'<div class="footer">'
'<p>URL: <a href="{0}">{0}</a></p>'.format(link),
for elink in getattr(entry, 'links', []):
if elink.get('rel', None) == 'via':
url = elink['href']
- url = url.replace(
- 'http://www.google.com/reader/public/atom/',
- 'http://www.google.com/reader/view/')
- title = url
- if elink.get('title', None):
- title = elink['title']
+ title = elink.get('title', url)
lines.append('<p>Via <a href="{}">{}</a></p>'.format(
url, title))
lines.extend([
return content
else: # not self.html_mail
if content['type'] in ('text/html', 'application/xhtml+xml'):
- lines = [_html2text.html2text(content['value'])]
+ try:
+ lines = [self._html2text(content['value'])]
+ except _html_parser.HTMLParseError as e:
+ raise _error.ProcessingError(parsed=None, feed=self)
else:
lines = [content['value']]
lines.append('')
for elink in getattr(entry, 'links', []):
if elink.get('rel', None) == 'via':
url = elink['href']
- url = url.replace(
- 'http://www.google.com/reader/public/atom/',
- 'http://www.google.com/reader/view/')
- title = url
- if elink.get('title', None):
- title = elink['title']
+ title = elink.get('title', url)
lines.append('Via: {} {}'.format(title, url))
content['type'] = 'text/plain'
content['value'] = '\n'.join(lines)
if not self.to:
raise _error.NoToEmailAddress(feed=self)
parsed = self._fetch()
+
+ if self.digest:
+ digest = self._new_digest()
+ seen = []
+
for (guid, id_, sender, message) in self._process(parsed):
_LOG.debug('new message: {}'.format(message['Subject']))
- if send:
- self._send(sender=sender, message=message)
- self.seen[guid] = id_
+ if self.digest:
+ seen.append((guid, id_))
+ self._append_to_digest(digest=digest, message=message)
+ else:
+ if send:
+ self._send(sender=sender, message=message)
+ if guid not in self.seen:
+ self.seen[guid] = {}
+ self.seen[guid]['id'] = id_
+
+ if self.digest and seen:
+ if self.digest_post_process:
+ digest = self.digest_post_process(
+ feed=self, parsed=parsed, seen=seen, message=digest)
+ if not digest:
+ return
+ self._send_digest(
+ digest=digest, seen=seen, sender=sender, send=send)
+
self.etag = parsed.get('etag', None)
self.modified = parsed.get('modified', None)
+
+    def _new_digest(self):
+        """Return a fresh multipart/digest message addressed for this feed."""
+        digest = _MIMEMultipart('digest')
+        digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
+        digest['Subject'] = 'digest for {}'.format(self.name)
+        # Random Message-ID; the .invalid TLD guarantees no real-domain collision.
+        digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
+        digest['User-Agent'] = _USER_AGENT
+        digest['X-RSS-Feed'] = self.url
+        return digest
+
+    def _append_to_digest(self, digest, message):
+        """Attach `message` to `digest` as a message/rfc822 part."""
+        part = _MIMEMessage(message)
+        # Mark as an attachment so MUAs present each digest entry separately.
+        part.add_header('Content-Disposition', 'attachment')
+        digest.attach(part)
+
+    def _send_digest(self, digest, seen, sender, send=True):
+        """Send a digest message
+
+        The date is extracted from the last message in the digest
+        payload.  We assume that this part exists.  If you don't have
+        any messages in the digest, don't call this function.
+        """
+        digest['From'] = sender # TODO: _Header(), _formataddr()...
+        # Each digest part is a message/rfc822 wrapper; unwrap the last
+        # part to reuse its Date header for the digest itself.
+        last_part = digest.get_payload()[-1]
+        last_message = last_part.get_payload()[0]
+        digest['Date'] = last_message['Date']
+
+        _LOG.debug('new digest for {}'.format(self))
+        if send:
+            self._send(sender=sender, message=digest)
+        # Record entries as seen only after the digest was (optionally) sent,
+        # so a send failure leaves them eligible for the next run.
+        for (guid, id_) in seen:
+            if guid not in self.seen:
+                self.seen[guid] = {}
+            self.seen[guid]['id'] = id_