From: Aaron Swartz Date: Fri, 25 Jun 2004 12:00:00 +0000 (+0000) Subject: Add rss2email v2.5 X-Git-Tag: v2.5 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=2182fc8aaaebf237f5043e90cf853b949dff7b53;p=rss2email.git Add rss2email v2.5 HTML mail, SMTP, feedparser 3.0, and html2text 2.0 support. Downloaded from: http://web.archive.org/web/20040825081604/http://www.aaronsw.com/2002/rss2email/rss2email-2.5.py --- diff --git a/rss2email.py b/rss2email.py index a64a879..0b93330 100644 --- a/rss2email.py +++ b/rss2email.py @@ -2,46 +2,50 @@ """rss2email: get RSS feeds emailed to you http://www.aaronsw.com/2002/rss2email -Usage: python rss2email.py feedfile action [options] - feedfile: name of the file to store feed info in - action [options]: - new [youremail] (create new feedfile) - email [yournewemail] (update default email) - run [--no-send] - add feedurl [youremail] - list - delete n +Usage: + new [youremail] (create new feedfile) + email [yournewemail] (update default email) + run [--no-send] [num] + add feedurl [youremail] + list + delete n """ -__version__ = "2.32" +__version__ = "2.5" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2." ___contributors__ = ["Dean Jackson (dino@grorg.org)", - "Brian Lalor (blalor@ithacabands.org)", - "Joey Hess", 'Matej Cepl'] + "Brian Lalor (blalor@ithacabands.org)", + "Joey Hess", 'Matej Cepl'] ### Vaguely Customizable Options ### # The email address messages are from by default: -DEFAULT_FROM = "bozo@dev.null" +DEFAULT_FROM = "bozo@dev.null.invalid" + +# 1: Send text/html messages when possible. +# 0: Convert HTML to plain text. +HTML_MAIL = 0 # 1: Only use the DEFAULT_FROM address. # 0: Use the email address specified by the feed, when possible. FORCE_FROM = 0 -# 1: Receive one email per post -# 0: Receive an email every time a post changes +# 1: Receive one email per post. +# 0: Receive an email every time a post changes. TRUST_GUID = 1 -# 1: Generate Date header based on item's date, when possible -# 0: Generate Date header based on time sent +# 1: Generate Date header based on item's date, when possible. +# 0: Generate Date header based on time sent. DATE_HEADER = 0 -# 1: Treat the contents of as HTML -# 0: Send the contents of as is, without conversion -TREAT_DESCRIPTION_AS_HTML = 1 +# A tuple consisting of some combination of +# ('issued', 'created', 'modified', 'expired') +# expressing ordered list of preference in dates +# to use for the Date header of the email. +DATE_HEADER_ORDER = ('modified', 'issued', 'created') -# 1: Apply Q-P conversion (required for some MUAs) -# 0: Send message in 8-bits +# 1: Apply Q-P conversion (required for some MUAs). +# 0: Send message in 8-bits. # http://cr.yp.to/smtp/8bitmime.html QP_REQUIRED = 0 @@ -49,18 +53,37 @@ QP_REQUIRED = 0 # 0: Keep quiet. VERBOSE = 0 +# 1: Use the publisher's email if you can't find the author's. +# 0: Just use the DEFAULT_FROM email instead. +USE_PUBLISHER_EMAIL = 0 + +# 1: Use SMTP_SERVER to send mail. +# 0: Call /usr/bin/sendmail to send mail. +SMTP_SEND = 0 + +SMTP_SERVER = "smtp.yourisp.net:25" + +# Note: You can also override the send function. def send(fr, to, message): - i, o = os.popen2(["/usr/sbin/sendmail", to]) - i.write(message) - i.close(); o.close() - del i, o - -# def send(fr, to, message): -# import smtplib -# s = smtplib.SMTP("vorpal.notabug.com:26") -# s.sendmail(fr, [to], message) + if SMTP_SEND: + smtpserver.sendmail(fr, [to], message) + else: + i, o = os.popen2(["/usr/sbin/sendmail", to]) + i.write(message) + i.close(); o.close() + del i, o + +## html2text options ## + +# 1: Use Unicode characters +# 0: Use ASCII psuedo-replacements +UNICODE_SNOB = 0 + +# 1: Put the links after each paragraph +# 0: Put all links at the end +LINKS_EACH_PARAGRAPH = 0 -### End of Options ### +### Load the Options ### # Read options from config file if present. import sys @@ -69,49 +92,126 @@ try: from config import * except: pass + +### Import Modules ### -from html2text import html2text, expandEntities -import feedparser -import cPickle as pickle, fcntl, md5, time, os, traceback +import cPickle as pickle, fcntl, md5, time, os, traceback, socket, urllib2, sys if QP_REQUIRED: import mimify; from StringIO import StringIO as SIO -def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) +if SMTP_SEND: import smtplib; smtpserver = smtplib.SMTP(SMTP_SERVER) +else: smtpserver = None + +import feedparser +feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/" + +import html2text + +html2text.UNICODE_SNOB = UNICODE_SNOB +html2text.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH +html2text = html2text.html2text + +### Utility Functions ### -def e(obj, val, ee=1): - x = obj[val] - if ee: x = expandEntities(x) - if type(x) is unicode: x = x.encode('utf-8') - return x.strip() +warn = sys.stderr -def quoteEmailName(s): - return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"' +def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) +def ishtml(t): return type(t) is tuple +def contains(a,b): return a.find(b) != -1 +def unu(s): # I / freakin' hate / that unicode + if type(s) is unicode: return s.encode('utf-8') + else: return s + +def quote822(s): + """Quote names in email according to RFC822.""" + return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"' -def getContent(item, url): - if item.has_key('content') and item['content']: - for c in item['content']: - if c['type'] == 'text/plain': return e(c, 'value') +### Parsing Utilities ### - for c in item['content']: - if c['type'].find('html') != -1: - return html2text(e(c, 'value', ee=0), c['base']) +def getContent(entry, HTMLOK=0): + """Select the best content from an entry, deHTMLizing if necessary. + If raw HTML is best, an ('HTML', best) tuple is returned. """ + + # How this works: + # * We have a bunch of potential contents. + # * We go thru looking for our first choice. + # (HTML or text, depending on HTMLOK) + # * If that doesn't work, we go thru looking for our second choice. + # * If that still doesn't work, we just take the first one. + # + # Possible future improvement: + # * Instead of just taking the first one + # pick the one in the "best" language. + # * HACK: hardcoded HTMLOK, should take a tuple of media types + + if entry.get('content', []): + if HTMLOK: + for c in entry.content: + if contains(c.type, 'html'): return ('HTML', c.value) + + for c in entry.content: + if c.type == 'text/plain': return c.value + + if not HTMLOK: # Only need to convert to text if HTML isn't OK + for c in entry.content: + if contains(c.type, 'html'): + return html2text(c.value) - return e(item['content'][0], 'value') - - if item.has_key('description'): - if TREAT_DESCRIPTION_AS_HTML: - return html2text(e(item, 'description', ee=0), url) - else: - return e(item, 'description') + return entry.content[0].value - if item.has_key('summary'): return e(item, 'summary') + if entry.get('summary_detail', {}): + s = entry.summary_detail.value + if contains(entry.summary_detail.type, 'html'): s = html2text(s) + return s + return "" -def getID(item, content): +def getID(entry): + """Get best ID from an entry.""" if TRUST_GUID: - if item.has_key('id') and item['id']: return e(item, 'id') + if 'id' in entry and entry.id: return entry.id + + content = getContent(entry) + if content: return md5.new(unu(content)).hexdigest() + if 'link' in entry: return entry.link + if 'title' in entry: return md5.new(unu(entry.title)).hexdigest() + +def getName(feed, entry): + """Get the best name.""" + + name = feed.get('title', '') + + if 'name' in entry.get('author_detail', []): # normally {} but py2.1 + if name: name += ", " + name += entry.author_detail.name + + elif 'name' in feed.get('author_detail', []): + if name: name += ", " + name += feed.author_detail.name + + return name + +def getEmail(feed, entry): + """Get the best email_address.""" - if content: return md5.new(content).hexdigest() - if item.has_key('link'): return e(item, 'link') - if item.has_key('title'): return md5.new(e(item, 'title')).hexdigest() + if FORCE_FROM: return DEFAULT_FROM + + if 'email' in entry.get('author_detail', []): + return entry.author_detail.email + + if 'email' in feed.get('author_detail', []): + return feed.author_detail.email + + #TODO: contributors + + if USE_PUBLISHER_EMAIL: + if 'email' in feed.get('publisher_detail', []): + return feed.publisher_detail.email + + if feed.get("errorreportsto", ''): + return feed.errorreportsto + + return DEFAULT_FROM + +### Simple Database of Feeds ### class Feed: def __init__(self, url, to): @@ -119,116 +219,170 @@ class Feed: self.to = to def load(lock=1): - ff2 = open(feedfile, 'r') - feeds = pickle.load(ff2) + feedfileObject = open(feedfile, 'r') + feeds = pickle.load(feedfileObject) if lock: - fcntl.flock(ff2, fcntl.LOCK_EX) + fcntl.flock(feedfileObject, fcntl.LOCK_EX) #HACK: to deal with lock caching - ff2 = open(feedfile, 'r') - feeds = pickle.load(ff2) - fcntl.flock(ff2, fcntl.LOCK_EX) + feedfileObject = open(feedfile, 'r') + feeds = pickle.load(feedfileObject) + fcntl.flock(feedfileObject, fcntl.LOCK_EX) - return feeds, ff2 + return feeds, feedfileObject -def unlock(feeds, ff2): +def unlock(feeds, feedfileObject): pickle.dump(feeds, open(feedfile+'.tmp', 'w')) os.rename(feedfile+'.tmp', feedfile) - fcntl.flock(ff2, fcntl.LOCK_UN) + fcntl.flock(feedfileObject, fcntl.LOCK_UN) + +### Program Functions ### + +def add(*args): + if len(args) == 2 and contains(args[1], '@') and not contain(args[1], '://'): + urls, to = [args[0]], args[1] + else: + urls, to = args, None -def add(url, to=None): - feeds, ff2 = load() + feeds, feedfileObject = load() if feeds and not isstr(feeds[0]) and to is None: raise 'NoEmail', "Run `email newaddr` or `add url addr`." - feeds.append(Feed(url, to)) - unlock(feeds, ff2) + for url in urls: feeds.append(Feed(url, to)) + unlock(feeds, feedfileObject) -def run(): - feeds, ff2 = load() +def run(num=None): + feeds, feedfileObject = load() try: - if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] + # We store the default to address as the first item in the feeds list. + # Here we take it out and save it for later. + if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] else: ifeeds = feeds + if num: ifeeds = [feeds[num]] + for f in ifeeds: try: - if VERBOSE: print "Processing", f.url - result = feedparser.parse(f.url, f.etag, f.modified) - - if result.has_key('status') and result['status'] == 301: f.url = result['url'] - - if result.has_key('encoding'): enc = result['encoding'] - else: enc = 'utf-8' + if VERBOSE: print >>warn, "I: Processing", f.url + r = feedparser.parse(f.url, f.etag, f.modified) - c, ert = result['channel'], 'errorreportsto' + # Handle various status conditions, as required + if 'status' in r: + if r.status == 301: f.url = r['url'] + elif r.status == 410: + print >>warn, "W: feed gone; deleting", f.url + feeds.remove(f) + continue - headers = "From: " - if c.has_key('title'): headers += quoteEmailName(e(c, 'title')) + ' ' - if (not FORCE_FROM) and c.has_key(ert) and c[ert].startswith('mailto:'): - fr = c[ert][7:] - else: - fr = DEFAULT_FROM - - headers += '<'+fr+'>' - - headers += "\nTo: " + (f.to or default_to) # set a default email! - if not QP_REQUIRED: - headers += '\nContent-Type: text/plain; charset="' + enc + '"' - - if not result['items'] and ((not result.has_key('status') or (result.has_key('status') and result['status'] != 304))): - print "W: no items; invalid feed? (" + f.url + ")" + http_status = r.get('status', 200) + http_headers = r.get('headers', { + 'content-type': 'application/rss+xml', + 'content-length':'1'}) + exc_type = r.get("bozo_exception", Exception()).__class__ + if http_status != 304 and not r.entries and not r.get('version', ''): + if http_status not in [200, 302]: + print >>warn, "W: error", http_status, f.url + elif contains(http_headers.get('content-type', 'rss'), 'html'): + print >>warn, "W: looks like HTML", f.url + elif http_headers.get('content-length', '1') == '0': + print >>warn, "W: empty page", f.url + elif exc_type == socket.timeout: + print >>warn, "W: timed out on", f.url + elif exc_type == urllib2.URLError: + if r.bozo_exception.reason.__class__ is socket.gaierror: + exc_reason = r.bozo_exception.reason.args[1] + else: + exc_reason = r.bozo_exception.reason + print >>warn, "W:", exc_reason, f.url + else: + print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ===" + print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url + print >>warn, r + print >>warn, "=== END HERE ===" continue - - for i in result['items']: - content = getContent(i, f.url) - id = getID(i, content) - if i.has_key('link') and i['link']: link = e(i, 'link') - else: link = None + r.entries.reverse() + + for entry in r.entries: + id = getID(entry) - if i.has_key('id') and i['id']: frameid = e(i, 'id') - else: frameid = id + # If TRUST_GUID isn't set, we get back hashes of the content. + # Instead of letting these run wild, we put them in context + # by associating them with the actual ID (if it exists). - if f.seen.has_key(frameid) and f.seen[frameid] == id: - continue # have seen - - if i.has_key('title'): title = e(i, 'title') - else: title = content[:70].replace("\n", " ") + frameid = entry.get('id', id) + + # If this item's ID is in our database + # then it's already been sent + # and we don't need to do anything more. - if DATE_HEADER and i.has_key('date_parsed'): - datetime = i['date_parsed'] + if f.seen.has_key(frameid) and f.seen[frameid] == id: continue + + if 'title_detail' in entry and entry.title_detail: + title = entry.title_detail.value + if contains(entry.title_detail.type, 'html'): + title = html2text(title) else: - datetime = time.gmtime() + title = getContent(entry)[:70] + + title = unu(title).replace("\n", " ") - message = (headers - + "\nSubject: " + title - + "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) - + "\nUser-Agent: rss2email" - + "\n") + datetime = time.gmtime() + + if DATE_HEADER: + for datetype in DATE_HEADER_ORDER: + kind = datetype+"_parsed" + if kind in entry: datetime = entry[kind] + + content = getContent(entry, HTMLOK=HTML_MAIL) - message += "\n" + content.strip() + "\n" + link = unu(entry.get('link', "")) + + from_addr = unu(getEmail(r.feed, entry)) + + message = ( + "From: " + quote822(getName(r.feed, entry)) + " <"+from_addr+">" + + "\nTo: " + unu(f.to or default_to) + # set a default email! + "\nSubject: " + title + + "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) + + "\nUser-Agent: rss2email" + # really should be X-Mailer + "\nContent-Type: ") # but backwards-compatibility - if link: message += "\nURL: " + link + "\n" + if ishtml(content): + message += "text/html" + + content = ("\n\n" + + '

'+title+'

\n\n' + + unu(content[1]).strip() + # drop type tag (HACK: bad abstraction) + '

URL: '+link+'

' + + "\n\n") + else: + message += "text/plain" + content = unu(content).strip() + "\n\nURL: "+link + message += '; charset="utf-8"\n\n' + content + "\n" + if QP_REQUIRED: - mimify.CHARSET = enc + mimify.CHARSET = 'utf-8' ins, outs = SIO(message), SIO() - mimify.mimify(ins, outs); outs.seek(0) - message = outs.read() + mimify.mimify(ins, outs) + message = outs.getvalue() - send(fr, (f.to or default_to), message) + send(from_addr, (f.to or default_to), message) f.seen[frameid] = id - f.etag, f.modified = result.get('etag', None), result.get('modified', None) + f.etag, f.modified = r.get('etag', None), r.get('modified', None) except: - print "E: could not parse", f.url - traceback.print_exc() + print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ===" + print >>warn, "E: could not parse", f.url + traceback.print_exc(file=warn) + print>>warn, "=== END HERE ===" continue finally: - unlock(feeds, ff2) + unlock(feeds, feedfileObject) def list(): - feeds, ff2 = load(lock=0) + feeds, feedfileObject = load(lock=0) if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]; i=1 @@ -239,15 +393,16 @@ def list(): i+= 1 def delete(n): - feeds, ff2 = load() + feeds, feedfileObject = load() feeds = feeds[:n] + feeds[n+1:] - unlock(feeds, ff2) + print >>warn, "W: feed IDs may have changed, list before deleting again" + unlock(feeds, feedfileObject) def email(addr): - feeds, ff2 = load() + feeds, feedfileObject = load() if feeds and isstr(feeds[0]): feeds[0] = addr else: feeds = [addr] + feeds - unlock(feeds, ff2) + unlock(feeds, feedfileObject) if __name__ == '__main__': ie, args = "InputError", sys.argv @@ -260,11 +415,12 @@ if __name__ == '__main__': def send(x,y,z): if VERBOSE: print 'Not sending', ( [x for x in z.splitlines() if x.startswith("Subject:")][0]) - run() + + if args and args[-1].isdigit(): run(int(args[-1])) + else: run() elif action == "email": email(args[0]) - print "W: Feed IDs may have changed. Run `list` before `delete`." elif action == "add": add(*args) @@ -279,7 +435,10 @@ if __name__ == '__main__': else: raise ie, "invalid action" - + + if smtpserver: + smtpserver.quit() + except ie, e: print "E:", e print