2 """rss2email: get RSS feeds emailed to you
3 http://rss2email.infogami.com
6 new [emailaddress] (create new feedfile)
7 email newemailaddress (update default email)
9 add feedurl [emailaddress]
14 __author__ = "Aaron Swartz (me@aaronsw.com)"
15 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
16 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
17 "Matej Cepl", "Martin 'Joey' Schulze",
18 "Marcel Ackermann (http://www.DreamFlasher.de)",
19 "Lindsey Smith (lindsey.smith@gmail.com)" ]
21 ### Vaguely Customizable Options ###
23 # The email address messages are from by default:
24 DEFAULT_FROM = "bozo@dev.null.invalid"
26 # 1: Send text/html messages when possible.
27 # 0: Convert HTML to plain text.
30 # 1: Only use the DEFAULT_FROM address.
31 # 0: Use the email address specified by the feed, when possible.
34 # 1: Receive one email per post.
35 # 0: Receive an email every time a post changes.
38 # 1: Generate Date header based on item's date, when possible.
39 # 0: Generate Date header based on time sent.
42 # A tuple consisting of some combination of
43 # ('issued', 'created', 'modified', 'expired')
44 # expressing ordered list of preference in dates
45 # to use for the Date header of the email.
46 DATE_HEADER_ORDER = ('modified', 'issued', 'created')
48 # 1: Apply Q-P conversion (required for some MUAs).
49 # 0: Send message in 8-bits.
50 # http://cr.yp.to/smtp/8bitmime.html
53 # 1: Name feeds as they're being processed.
57 # 1: Use the publisher's email if you can't find the author's.
58 # 0: Just use the DEFAULT_FROM email instead.
59 USE_PUBLISHER_EMAIL = 0
61 # 1: Use SMTP_SERVER to send mail.
62 # 0: Call /usr/sbin/sendmail to send mail.
65 SMTP_SERVER = "smtp.yourisp.net:25"
66 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
67 SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
68 SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
70 # Set this to add a bonus header to all emails (start with '\n').
72 # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
74 # Set this to override From addresses. Keys are feed URLs, values are new titles.
77 # Set this to override the timeout (in seconds) for feed server response
80 # Optional CSS styling
82 STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
84 # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
87 # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
88 # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
89 CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'
91 from email.MIMEText import MIMEText
92 from email.Header import Header
93 from email.Utils import parseaddr, formataddr
95 # Note: You can also override the send function.
97 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
100 All arguments should be Unicode strings (plain ASCII works as well).
102 Only the real name part of sender and recipient addresses may contain
103 non-ASCII characters.
105 The email will be properly MIME encoded and delivered through SMTP to
106 localhost port 25. This is easy to change if you want something different.
108 The charset of the email will be the first one out of the list
109 that can represent all the characters occurring in the email.
112 # Header class is smart enough to try US-ASCII, then the charset we
113 # provide, then fall back to UTF-8.
114 header_charset = 'ISO-8859-1'
116 # We must choose the body charset manually
117 for body_charset in CHARSET_LIST:
119 body.encode(body_charset)
120 except (UnicodeError, LookupError):
125 # Split real name (which is optional) and email address parts
126 sender_name, sender_addr = parseaddr(sender)
127 recipient_name, recipient_addr = parseaddr(recipient)
129 # We must always pass Unicode strings to Header, otherwise it will
130 # use RFC 2047 encoding even on plain ASCII strings.
131 sender_name = str(Header(unicode(sender_name), header_charset))
132 recipient_name = str(Header(unicode(recipient_name), header_charset))
134 # Make sure email addresses do not contain non-ASCII characters
135 sender_addr = sender_addr.encode('ascii')
136 recipient_addr = recipient_addr.encode('ascii')
138 # Create the message ('plain' stands for Content-Type: text/plain)
139 msg = MIMEText(body.encode(body_charset), contenttype, body_charset)
140 msg['To'] = formataddr((recipient_name, recipient_addr))
141 msg['Subject'] = Header(unicode(subject), header_charset)
142 for hdr in extraheaders.keys():
143 msg[hdr] = Header(unicode(extraheaders[hdr], header_charset))
145 fromhdr = formataddr((sender_name, sender_addr))
146 msg['From'] = fromhdr
148 msg_as_string = msg.as_string()
150 ins, outs = SIO(msg_as_string), SIO()
151 mimify.mimify(ins, outs)
152 msg_as_string = outs.getvalue()
159 smtpserver = smtplib.SMTP(SMTP_SERVER)
160 except KeyboardInterrupt:
164 print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER)
165 if hasattr(e, 'reason'):
166 print >>warn, "Reason:", e.reason
172 smtpserver.starttls()
174 smtpserver.login(SMTP_USER, SMTP_PASS)
175 except KeyboardInterrupt:
179 print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER))
180 if hasattr(e, 'reason'):
181 print >>warn, "Reason:", e.reason
184 smtpserver.sendmail(sender, recipient, msg_as_string)
189 i, o = os.popen2(["/usr/sbin/sendmail", recipient])
190 i.write(msg_as_string)
194 print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email:
195 # 1: Use SMTP_SERVER to send mail.
196 # 0: Call /usr/sbin/sendmail to send mail.
199 SMTP_SERVER = "smtp.yourisp.net:25"
200 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
201 SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
202 SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
207 ## html2text options ##
209 # Use Unicode characters instead of their ASCII pseudo-replacements
212 # Put the links after each paragraph instead of at the end.
213 LINKS_EACH_PARAGRAPH = 0
215 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
218 ### Load the Options ###
220 # Read options from config file if present.
228 ### Import Modules ###
230 import cPickle as pickle, md5, time, os, traceback, urllib2, sys, types
import socket

# Gather the socket exception classes that this Python build actually
# exposes; the feed-fetch error handling later tests exception types
# against this list.
socket_errors = []
for e in ('error', 'gaierror'):
    if hasattr(socket, e):
        socket_errors.append(getattr(socket, e))
241 import mimify; from StringIO import StringIO as SIO; mimify.CHARSET = 'utf-8'
244 feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"
246 import html2text as h2t
248 h2t.UNICODE_SNOB = UNICODE_SNOB
249 h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
250 h2t.BODY_WIDTH = BODY_WIDTH
251 html2text = h2t.html2text
253 ### Utility Functions ###
class TimeoutError(Exception):
    """Signals that an operation did not finish within its time limit.

    NOTE(review): presumably raised by timelimit() when the wrapped call
    overruns; the raising line is not visible here -- confirm.
    """
class InputError(Exception):
    """Raised when the command line is unusable.

    Covers too few arguments, an unknown action, or an action invoked
    without the argument it requires.
    """
260 def timelimit(timeout, function):
261 # def internal(function):
262 def internal2(*args, **kw):
264 from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878
266 class Calculator(threading.Thread):
268 threading.Thread.__init__(self)
274 self.result = function(*args, **kw)
276 self.error = sys.exc_info()
279 c.setDaemon(True) # don't hold up exiting
285 raise c.error[0], c.error[1]
def isstr(f):
    """Return True if *f* is a plain string or a unicode string."""
    string_types = (type(''), type(u''))
    return isinstance(f, string_types)
def ishtml(t):
    """Return True if *t* is exactly a tuple.

    getContent() signals raw HTML by returning an ('HTML', text) tuple
    rather than a bare string, so a tuple here means HTML content.
    """
    return type(t) is tuple
def contains(a, b):
    """Return True if *b* occurs somewhere in *a*, as reported by a.find(b)."""
    position = a.find(b)
    return position != -1
296 def unu(s): # I / freakin' hate / that unicode
297 if type(s) is types.UnicodeType: return s.encode('utf-8')
301 """Quote names in email according to RFC822."""
302 return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"'
305 """QP_CORRUPT headers."""
306 #return mimify.mime_encode_header(s + ' ')[:-1]
307 # XXX due to mime_encode_header bug
309 p = re.compile('=\n([^ \t])');
310 return p.sub(r'\1', mimify.mime_encode_header(s + ' ')[:-1])
312 ### Parsing Utilities ###
314 def getContent(entry, HTMLOK=0):
315 """Select the best content from an entry, deHTMLizing if necessary.
316 If raw HTML is best, an ('HTML', best) tuple is returned. """
319 # * We have a bunch of potential contents.
320 # * We go thru looking for our first choice.
321 # (HTML or text, depending on HTMLOK)
322 # * If that doesn't work, we go thru looking for our second choice.
323 # * If that still doesn't work, we just take the first one.
325 # Possible future improvement:
326 # * Instead of just taking the first one
327 # pick the one in the "best" language.
328 # * HACK: hardcoded HTMLOK, should take a tuple of media types
330 conts = entry.get('content', [])
332 if entry.get('summary_detail', {}):
333 conts += [entry.summary_detail]
338 if contains(c.type, 'html'): return ('HTML', c.value)
340 if not HTMLOK: # Only need to convert to text if HTML isn't OK
342 if contains(c.type, 'html'):
343 return html2text(c.value)
346 if c.type == 'text/plain': return c.value
348 return conts[0].value
353 """Get best ID from an entry."""
355 if 'id' in entry and entry.id: return entry.id
357 content = getContent(entry)
358 if content and content != "\n": return md5.new(unu(content)).hexdigest()
359 if 'link' in entry: return entry.link
360 if 'title' in entry: return md5.new(unu(entry.title)).hexdigest()
362 def getName(r, entry):
363 """Get the best name."""
366 if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
367 return OVERRIDE_FROM[r.url]
369 name = feed.get('title', '')
371 if 'name' in entry.get('author_detail', []): # normally {} but py2.1
372 if entry.author_detail.name:
373 if name: name += ": "
374 det=entry.author_detail.name
376 name += entry.author_detail.name
377 except UnicodeDecodeError:
378 name += unicode(entry.author_detail.name, 'utf-8')
380 elif 'name' in feed.get('author_detail', []):
381 if feed.author_detail.name:
382 if name: name += ", "
383 name += feed.author_detail.name
387 def getEmail(feed, entry):
388 """Get the best email_address."""
390 if FORCE_FROM: return DEFAULT_FROM
392 if 'email' in entry.get('author_detail', []):
393 return entry.author_detail.email
395 if 'email' in feed.get('author_detail', []):
396 return feed.author_detail.email
400 if USE_PUBLISHER_EMAIL:
401 if 'email' in feed.get('publisher_detail', []):
402 return feed.publisher_detail.email
404 if feed.get("errorreportsto", ''):
405 return feed.errorreportsto
409 ### Simple Database of Feeds ###
412 def __init__(self, url, to):
413 self.url, self.etag, self.modified, self.seen = url, None, None, {}
417 if not os.path.exists(feedfile):
418 print 'Feedfile "%s" does not exist. If you\'re using r2e for the first time, you' % feedfile
419 print "have to run 'r2e new' first."
422 feedfileObject = open(feedfile, 'r')
424 print "Feedfile could not be opened: %s" % e
426 feeds = pickle.load(feedfileObject)
430 locktype = fcntl.LOCK_EX
431 if (sys.platform.find('sunos')): locktype = fcntl.LOCK_SH
432 fcntl.flock(feedfileObject.fileno(), locktype)
433 #HACK: to deal with lock caching
434 feedfileObject = open(feedfile, 'r')
435 feeds = pickle.load(feedfileObject)
436 if unix: fcntl.flock(feedfileObject.fileno(), locktype)
438 return feeds, feedfileObject
440 def unlock(feeds, feedfileObject):
442 pickle.dump(feeds, open(feedfile, 'w'))
444 pickle.dump(feeds, open(feedfile+'.tmp', 'w'))
445 os.rename(feedfile+'.tmp', feedfile)
446 fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
448 #@timelimit(FEED_TIMEOUT)
449 def parse(url, etag, modified):
451 return feedparser.parse(url, etag, modified)
453 proxy = urllib2.ProxyHandler( {"http":PROXY} )
454 return feedparser.parse(url, etag, modified, handlers = [proxy])
457 ### Program Functions ###
460 if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
461 urls, to = [args[0]], args[1]
463 urls, to = args, None
465 feeds, feedfileObject = load()
466 if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None):
467 print "No email address has been defined. Please run 'r2e email emailaddress' or"
468 print "'r2e add url emailaddress'."
470 for url in urls: feeds.append(Feed(url, to))
471 unlock(feeds, feedfileObject)
474 feeds, feedfileObject = load()
476 # We store the default to address as the first item in the feeds list.
477 # Here we take it out and save it for later.
479 if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
482 if num: ifeeds = [feeds[num]]
490 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
493 r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
495 print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
498 # Handle various status conditions, as required
500 if r.status == 301: f.url = r['url']
501 elif r.status == 410:
502 print >>warn, "W: feed gone; deleting", f.url
506 http_status = r.get('status', 200)
507 http_headers = r.get('headers', {
508 'content-type': 'application/rss+xml',
509 'content-length':'1'})
510 exc_type = r.get("bozo_exception", Exception()).__class__
511 if http_status != 304 and not r.get('version', ''):
512 if http_status not in [200, 302]:
513 print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
515 elif contains(http_headers.get('content-type', 'rss'), 'html'):
516 print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
518 elif http_headers.get('content-length', '1') == '0':
519 print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
521 elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
522 print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
524 elif exc_type == IOError:
525 print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
527 elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
528 print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
530 elif exc_type in socket_errors:
531 exc_reason = r.bozo_exception.args[1]
532 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
534 elif exc_type == urllib2.URLError:
535 if r.bozo_exception.reason.__class__ in socket_errors:
536 exc_reason = r.bozo_exception.reason.args[1]
538 exc_reason = r.bozo_exception.reason
539 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
541 elif exc_type == AttributeError:
542 print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
544 elif exc_type == KeyboardInterrupt:
545 raise r.bozo_exception
548 print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
551 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
552 print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
554 print >>warn, "rss2email", __version__
555 print >>warn, "feedparser", feedparser.__version__
556 print >>warn, "html2text", h2t.__version__
557 print >>warn, "Python", sys.version
558 print >>warn, "=== END HERE ==="
563 for entry in r.entries:
566 # If TRUST_GUID isn't set, we get back hashes of the content.
567 # Instead of letting these run wild, we put them in context
568 # by associating them with the actual ID (if it exists).
570 frameid = entry.get('id', id)
572 # If this item's ID is in our database
573 # then it's already been sent
574 # and we don't need to do anything more.
576 if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
578 if not (f.to or default_to):
579 print "No default email address defined. Please run 'r2e email emailaddress'"
580 print "Ignoring feed %s" % f.url
583 if 'title_detail' in entry and entry.title_detail:
584 title = entry.title_detail.value
585 if contains(entry.title_detail.type, 'html'):
586 title = html2text(title)
588 title = getContent(entry)[:70]
590 title = title.replace("\n", " ").strip()
592 datetime = time.gmtime()
595 for datetype in DATE_HEADER_ORDER:
596 kind = datetype+"_parsed"
597 if kind in entry and entry[kind]: datetime = entry[kind]
599 link = entry.get('link', "")
601 from_addr = getEmail(r.feed, entry)
603 name = getName(r, entry)
604 fromhdr = '"'+ name + '" <' + from_addr + ">"
605 tohdr = (f.to or default_to)
607 datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
608 useragenthdr = "rss2email"
609 extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr}
610 if BONUS_HEADER != '':
611 for hdr in BONUS_HEADER.strip().splitlines():
612 pos = hdr.strip().find(':')
614 extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
616 print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
618 entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
619 contenttype = 'plain'
621 if USE_CSS_STYLING and HTML_MAIL:
624 content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
625 content += '<body>\n'
626 content += '<div id="entry">\n'
628 content += ' class="header"'
629 content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n'
630 if ishtml(entrycontent):
631 body = entrycontent[1].strip()
633 body = entrycontent.strip()
635 content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
636 content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
637 if hasattr(entry,'enclosures'):
638 for enclosure in entry.enclosures:
639 if enclosure.url != "":
640 content += ('<br/>Enclosure: <a href="'+unu(enclosure.url)+'">'+unu(enclosure.url)+"</a>\n")
641 content += '</p></div>\n'
642 content += "\n\n</body></html>"
644 if ishtml(entrycontent):
647 content = ("<html><body>\n\n" +
648 '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
649 entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
650 '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
652 if hasattr(entry,'enclosures'):
653 for enclosure in entry.enclosures:
654 if enclosure.url != "":
655 content += ('Enclosure: <a href="'+unu(enclosure.url)+'">'+unu(enclosure.url)+"</a><br/>\n")
657 content += ("\n</body></html>")
659 content = entrycontent.strip() + "\n\nURL: "+link
660 if hasattr(entry,'enclosures'):
661 for enclosure in entry.enclosures:
662 if enclosure.url != "":
663 content += ('\nEnclosure: '+unu(enclosure.url)+"\n")
665 smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
669 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
670 except (KeyboardInterrupt, SystemExit):
673 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
674 print >>warn, "E: could not parse", f.url
675 traceback.print_exc(file=warn)
676 print >>warn, "rss2email", __version__
677 print >>warn, "feedparser", feedparser.__version__
678 print >>warn, "html2text", h2t.__version__
679 print >>warn, "Python", sys.version
680 print >>warn, "=== END HERE ==="
684 unlock(feeds, feedfileObject)
689 feeds, feedfileObject = load(lock=0)
692 if feeds and isstr(feeds[0]):
693 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
694 print "default email:", default_to
695 else: ifeeds = feeds; i = 0
697 print `i`+':', f.url, '('+(f.to or ('default: '+default_to))+')'
698 if not (f.to or default_to):
699 print " W: Please define a default address with 'r2e email emailaddress'"
703 feeds, feedfileObject = load()
704 if (n == 0) and (feeds and isstr(feeds[0])):
705 print >>warn, "W: ID has to be equal to or higher than 1"
706 elif n >= len(feeds):
707 print >>warn, "W: no such feed"
709 print >>warn, "W: deleting feed %s" % feeds[n].url
710 feeds = feeds[:n] + feeds[n+1:]
712 print >>warn, "W: feed IDs have changed, list before deleting again"
713 unlock(feeds, feedfileObject)
716 feeds, feedfileObject = load()
717 if feeds and isstr(feeds[0]): feeds[0] = addr
718 else: feeds = [addr] + feeds
719 unlock(feeds, feedfileObject)
721 if __name__ == '__main__':
724 if len(args) < 3: raise InputError, "insufficient args"
725 feedfile, action, args = args[1], args[2], args[3:]
728 if args and args[0] == "--no-send":
729 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
730 if VERBOSE: print 'Not sending:', unu(subject)
732 if args and args[-1].isdigit(): run(int(args[-1]))
735 elif action == "email":
737 raise InputError, "Action '%s' requires an argument" % action
741 elif action == "add": add(*args)
743 elif action == "new":
744 if len(args) == 1: d = [args[0]]
746 pickle.dump(d, open(feedfile, 'w'))
748 elif action == "list": list()
750 elif action in ("help", "--help", "-h"): print __doc__
752 elif action == "delete":
754 raise InputError, "Action '%s' requires an argument" % action
755 elif args[0].isdigit():
758 raise InputError, "Action '%s' requires a number as its argument" % action
761 raise InputError, "Invalid action"
763 except InputError, e: