Add rss2email v2.5

author Aaron Swartz <me@aaronsw.com>

Fri, 25 Jun 2004 12:00:00 +0000 (12:00 +0000)

committer W. Trevor King <wking@tremily.us>

Fri, 15 Feb 2013 16:56:51 +0000 (11:56 -0500)
author Aaron Swartz <me@aaronsw.com>
Fri, 25 Jun 2004 12:00:00 +0000 (12:00 +0000)
committer W. Trevor King <wking@tremily.us>
Fri, 15 Feb 2013 16:56:51 +0000 (11:56 -0500)
diff --git a/rss2email.py b/rss2email.py

index a64a879650a065349a60cee22d7a65ab78f4d4a2..0b9333057a4779f753a996c339140e507c10c1bc 100644 (file)
--- a/rss2email.py
+++ b/rss2email.py
@@ -2,46 +2,50 @@
  """rss2email: get RSS feeds emailed to you
  http://www.aaronsw.com/2002/rss2email
  
-Usage: python rss2email.py feedfile action [options]
-  feedfile: name of the file to store feed info in
-  action [options]:
-       new [youremail] (create new feedfile)
-       email [yournewemail] (update default email)
-       run [--no-send]
-       add feedurl [youremail]
-       list
-       delete n
+Usage:
+  new [youremail] (create new feedfile)
+  email [yournewemail] (update default email)
+  run [--no-send] [num]
+  add feedurl [youremail]
+  list
+  delete n
  """
-__version__ = "2.32"
+__version__ = "2.5"
  __author__ = "Aaron Swartz (me@aaronsw.com)"
  __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
  ___contributors__ = ["Dean Jackson (dino@grorg.org)", 
-                                        "Brian Lalor (blalor@ithacabands.org)",
-                                        "Joey Hess", 'Matej Cepl']
+                     "Brian Lalor (blalor@ithacabands.org)",
+                     "Joey Hess", 'Matej Cepl']
  
  ### Vaguely Customizable Options ###
  
  # The email address messages are from by default:
-DEFAULT_FROM = "bozo@dev.null"
+DEFAULT_FROM = "bozo@dev.null.invalid"
+
+# 1: Send text/html messages when possible.
+# 0: Convert HTML to plain text.
+HTML_MAIL = 0
  
  # 1: Only use the DEFAULT_FROM address.
  # 0: Use the email address specified by the feed, when possible.
  FORCE_FROM = 0
  
-# 1: Receive one email per post
-# 0: Receive an email every time a post changes
+# 1: Receive one email per post.
+# 0: Receive an email every time a post changes.
  TRUST_GUID = 1
  
-# 1: Generate Date header based on item's date, when possible
-# 0: Generate Date header based on time sent
+# 1: Generate Date header based on item's date, when possible.
+# 0: Generate Date header based on time sent.
  DATE_HEADER = 0
  
-# 1: Treat the contents of <description> as HTML
-# 0: Send the contents of <description> as is, without conversion
-TREAT_DESCRIPTION_AS_HTML = 1
+# A tuple consisting of some combination of
+# ('issued', 'created', 'modified', 'expired')
+# giving, in order of preference, which dates
+# to use for the Date header of the email.
+DATE_HEADER_ORDER = ('modified', 'issued', 'created')
  
-# 1: Apply Q-P conversion (required for some MUAs)
-# 0: Send message in 8-bits
+# 1: Apply Q-P conversion (required for some MUAs).
+# 0: Send message in 8-bits.
  # http://cr.yp.to/smtp/8bitmime.html
  QP_REQUIRED = 0
  
@@ -49,18 +53,37 @@ QP_REQUIRED = 0
  # 0: Keep quiet.
  VERBOSE = 0
  
+# 1: Use the publisher's email if you can't find the author's.
+# 0: Just use the DEFAULT_FROM email instead.
+USE_PUBLISHER_EMAIL = 0
+
+# 1: Use SMTP_SERVER to send mail.
+# 0: Call /usr/bin/sendmail to send mail.
+SMTP_SEND = 0
+
+SMTP_SERVER = "smtp.yourisp.net:25"
+
+# Note: You can also override the send function.
  def send(fr, to, message):
-       i, o = os.popen2(["/usr/sbin/sendmail", to])
-       i.write(message)
-       i.close(); o.close()
-       del i, o
-       
-# def send(fr, to, message):
-#      import smtplib
-#      s = smtplib.SMTP("vorpal.notabug.com:26")
-#      s.sendmail(fr, [to], message)
+       if SMTP_SEND:
+               smtpserver.sendmail(fr, [to], message)
+       else:
+               i, o = os.popen2(["/usr/sbin/sendmail", to])
+               i.write(message)
+               i.close(); o.close()
+               del i, o
+
+## html2text options ##
+
+# 1: Use Unicode characters
+# 0: Use ASCII pseudo-replacements
+UNICODE_SNOB = 0
+
+# 1: Put the links after each paragraph
+# 0: Put all links at the end
+LINKS_EACH_PARAGRAPH = 0
  
-### End of Options ###
+### Load the Options ###
  
  # Read options from config file if present.
  import sys
@@ -69,49 +92,126 @@ try:
         from config import *
  except:
         pass
+       
+### Import Modules ###
  
-from html2text import html2text, expandEntities
-import feedparser
-import cPickle as pickle, fcntl, md5, time, os, traceback
+import cPickle as pickle, fcntl, md5, time, os, traceback, socket, urllib2, sys
  if QP_REQUIRED: import mimify; from StringIO import StringIO as SIO
-def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
+if SMTP_SEND: import smtplib; smtpserver = smtplib.SMTP(SMTP_SERVER)
+else: smtpserver = None
+
+import feedparser
+feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"
+
+import html2text
+
+html2text.UNICODE_SNOB = UNICODE_SNOB
+html2text.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
+html2text = html2text.html2text
+
+### Utility Functions ###
  
-def e(obj, val, ee=1):
-       x = obj[val]
-       if ee: x = expandEntities(x)
-       if type(x) is unicode: x = x.encode('utf-8')
-       return x.strip()
+warn = sys.stderr
  
-def quoteEmailName(s):
-       return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
+def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
+def ishtml(t): return type(t) is tuple
+def contains(a,b): return a.find(b) != -1
+def unu(s): # I / freakin' hate / that unicode
+       if type(s) is unicode: return s.encode('utf-8')
+       else: return s
+
+def quote822(s):
+       """Quote names in email according to RFC822."""
+       return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"'
  
-def getContent(item, url):
-       if item.has_key('content') and item['content']:
-               for c in item['content']:
-                       if c['type'] == 'text/plain': return e(c, 'value')
+### Parsing Utilities ###
  
-               for c in item['content']:
-                       if c['type'].find('html') != -1:
-                               return html2text(e(c, 'value', ee=0), c['base'])
+def getContent(entry, HTMLOK=0):
+       """Select the best content from an entry, deHTMLizing if necessary.
+       If raw HTML is best, an ('HTML', best) tuple is returned. """
+       
+       # How this works:
+       #  * We have a bunch of potential contents. 
+       #  * We go thru looking for our first choice. 
+       #    (HTML or text, depending on HTMLOK)
+       #  * If that doesn't work, we go thru looking for our second choice.
+       #  * If that still doesn't work, we just take the first one.
+       #
+       # Possible future improvement:
+       #  * Instead of just taking the first one
+       #    pick the one in the "best" language.
+       #  * HACK: hardcoded HTMLOK, should take a tuple of media types
+       
+       if entry.get('content', []):
+               if HTMLOK:
+                       for c in entry.content:
+                               if contains(c.type, 'html'): return ('HTML', c.value)
+       
+               for c in entry.content:
+                       if c.type == 'text/plain': return c.value
+       
+               if not HTMLOK: # Only need to convert to text if HTML isn't OK
+                       for c in entry.content:
+                               if contains(c.type, 'html'):
+                                       return html2text(c.value)
                 
-               return e(item['content'][0], 'value')
-                       
-       if item.has_key('description'): 
-               if TREAT_DESCRIPTION_AS_HTML:
-                       return html2text(e(item, 'description', ee=0), url)
-               else:
-                       return e(item, 'description')
+               return entry.content[0].value   
         
-       if item.has_key('summary'): return e(item, 'summary')
+       if entry.get('summary_detail', {}):
+               s = entry.summary_detail.value
+               if contains(entry.summary_detail.type, 'html'): s = html2text(s)
+               return s
+
         return ""
  
-def getID(item, content):
+def getID(entry):
+       """Get best ID from an entry."""
         if TRUST_GUID:
-               if item.has_key('id') and item['id']: return e(item, 'id')
+               if 'id' in entry and entry.id: return entry.id
+
+       content = getContent(entry)
+       if content: return md5.new(unu(content)).hexdigest()
+       if 'link' in entry: return entry.link
+       if 'title' in entry: return md5.new(unu(entry.title)).hexdigest()
+
+def getName(feed, entry):
+       """Get the best name."""
+       
+       name = feed.get('title', '')
+       
+       if 'name' in entry.get('author_detail', []): # normally {} but py2.1
+               if name: name += ", "
+               name +=  entry.author_detail.name
+
+       elif 'name' in feed.get('author_detail', []):
+               if name: name += ", "
+               name += feed.author_detail.name
+       
+       return name
+
+def getEmail(feed, entry):
+       """Get the best email_address."""
  
-       if content: return md5.new(content).hexdigest()
-       if item.has_key('link'): return e(item, 'link')
-       if item.has_key('title'): return md5.new(e(item, 'title')).hexdigest()
+       if FORCE_FROM: return DEFAULT_FROM
+       
+       if 'email' in entry.get('author_detail', []):
+               return entry.author_detail.email
+       
+       if 'email' in feed.get('author_detail', []):
+               return feed.author_detail.email
+               
+       #TODO: contributors
+       
+       if USE_PUBLISHER_EMAIL:
+               if 'email' in feed.get('publisher_detail', []):
+                       return feed.publisher_detail.email
+               
+               if feed.get("errorreportsto", ''):
+                       return feed.errorreportsto
+                       
+       return DEFAULT_FROM
+
+### Simple Database of Feeds ###
  
  class Feed:
         def __init__(self, url, to):
@@ -119,116 +219,170 @@ class Feed:
                 self.to = to            
  
  def load(lock=1):
-       ff2 = open(feedfile, 'r')
-       feeds = pickle.load(ff2)
+       feedfileObject = open(feedfile, 'r')
+       feeds = pickle.load(feedfileObject)
         if lock:
-               fcntl.flock(ff2, fcntl.LOCK_EX)
+               fcntl.flock(feedfileObject, fcntl.LOCK_EX)
                 #HACK: to deal with lock caching
-               ff2 = open(feedfile, 'r')
-               feeds = pickle.load(ff2)
-               fcntl.flock(ff2, fcntl.LOCK_EX)
+               feedfileObject = open(feedfile, 'r')
+               feeds = pickle.load(feedfileObject)
+               fcntl.flock(feedfileObject, fcntl.LOCK_EX)
  
-       return feeds, ff2
+       return feeds, feedfileObject
  
-def unlock(feeds, ff2):
+def unlock(feeds, feedfileObject):
         pickle.dump(feeds, open(feedfile+'.tmp', 'w'))
         os.rename(feedfile+'.tmp', feedfile)
-       fcntl.flock(ff2, fcntl.LOCK_UN)
+       fcntl.flock(feedfileObject, fcntl.LOCK_UN)
+
+### Program Functions ###
+
+def add(*args):
+       if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
+               urls, to = [args[0]], args[1] # `add feedurl youremail` form
+       else:
+               urls, to = args, None # every argument is a feed URL
         
-def add(url, to=None):
-       feeds, ff2 = load()
+       feeds, feedfileObject = load()
         if feeds and not isstr(feeds[0]) and to is None:
                 raise 'NoEmail', "Run `email newaddr` or `add url addr`."
-       feeds.append(Feed(url, to))
-       unlock(feeds, ff2)
+       for url in urls: feeds.append(Feed(url, to))
+       unlock(feeds, feedfileObject)
  
-def run():
-       feeds, ff2 = load()
+def run(num=None):
+       feeds, feedfileObject = load()
         try:
-               if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
+               # We store the default to address as the first item in the feeds list.
+               # Here we take it out and save it for later.
+               if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
                 else: ifeeds = feeds
                 
+               if num: ifeeds = [feeds[num]]
+               
                 for f in ifeeds:
                         try: 
-                               if VERBOSE: print "Processing", f.url
-                               result = feedparser.parse(f.url, f.etag, f.modified)
-                               
-                               if result.has_key('status') and result['status'] == 301: f.url = result['url']
-                               
-                               if result.has_key('encoding'): enc = result['encoding']
-                               else: enc = 'utf-8'
+                               if VERBOSE: print >>warn, "I: Processing", f.url
+                               r = feedparser.parse(f.url, f.etag, f.modified)
                                 
-                               c, ert = result['channel'], 'errorreportsto'
+                               # Handle various status conditions, as required
+                               if 'status' in r:
+                                       if r.status == 301: f.url = r['url']
+                                       elif r.status == 410:
+                                               print >>warn, "W: feed gone; deleting", f.url
+                                               feeds.remove(f)
+                                               continue
                                 
-                               headers = "From: "
-                               if c.has_key('title'): headers += quoteEmailName(e(c, 'title')) + ' '
-                               if (not FORCE_FROM) and c.has_key(ert) and c[ert].startswith('mailto:'):
-                                       fr = c[ert][7:]
-                               else:
-                                       fr = DEFAULT_FROM
-                               
-                               headers += '<'+fr+'>'
-                                               
-                               headers += "\nTo: " + (f.to or default_to) # set a default email!
-                               if not QP_REQUIRED:
-                                       headers += '\nContent-Type: text/plain; charset="' + enc + '"'
-                               
-                               if not result['items'] and ((not result.has_key('status') or (result.has_key('status') and result['status'] != 304))):
-                                       print "W: no items; invalid feed? (" + f.url + ")"
+                               http_status = r.get('status', 200)
+                               http_headers = r.get('headers', {
+                                 'content-type': 'application/rss+xml', 
+                                 'content-length':'1'})
+                               exc_type = r.get("bozo_exception", Exception()).__class__
+                               if http_status != 304 and not r.entries and not r.get('version', ''):
+                                       if http_status not in [200, 302]: 
+                                               print >>warn, "W: error", http_status, f.url
+                                       elif contains(http_headers.get('content-type', 'rss'), 'html'):
+                                               print >>warn, "W: looks like HTML", f.url
+                                       elif http_headers.get('content-length', '1') == '0':
+                                               print >>warn, "W: empty page", f.url
+                                       elif exc_type == socket.timeout:
+                                               print >>warn, "W: timed out on", f.url
+                                       elif exc_type == urllib2.URLError:
+                                               if r.bozo_exception.reason.__class__ is socket.gaierror:
+                                                       exc_reason = r.bozo_exception.reason.args[1]
+                                               else:
+                                                       exc_reason = r.bozo_exception.reason
+                                               print >>warn, "W:", exc_reason, f.url
+                                       else:
+                                               print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
+                                               print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
+                                               print >>warn, r
+                                               print >>warn, "=== END HERE ==="
                                         continue
-                       
-                               for i in result['items']:
-                                       content = getContent(i, f.url)
-                                       id = getID(i, content)
                                 
-                                       if i.has_key('link') and i['link']: link = e(i, 'link')
-                                       else: link = None
+                               r.entries.reverse()
+                               
+                               for entry in r.entries:
+                                       id = getID(entry)
                                         
-                                       if i.has_key('id') and i['id']: frameid = e(i, 'id')
-                                       else: frameid = id
+                                       # If TRUST_GUID isn't set, we get back hashes of the content.
+                                       # Instead of letting these run wild, we put them in context
+                                       # by associating them with the actual ID (if it exists).
                                         
-                                       if f.seen.has_key(frameid) and f.seen[frameid] == id:
-                                               continue # have seen
-                       
-                                       if i.has_key('title'): title = e(i, 'title')
-                                       else: title = content[:70].replace("\n", " ")
+                                       frameid = entry.get('id', id)
+                                       
+                                       # If this item's ID is in our database
+                                       # then it's already been sent
+                                       # and we don't need to do anything more.
                                         
-                                       if DATE_HEADER and i.has_key('date_parsed'):
-                                               datetime = i['date_parsed']     
+                                       if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
+                                                                               
+                                       if 'title_detail' in entry and entry.title_detail:
+                                               title = entry.title_detail.value
+                                               if contains(entry.title_detail.type, 'html'):
+                                                       title = html2text(title)
                                         else:
-                                               datetime = time.gmtime()
+                                               title = getContent(entry)[:70]
+
+                                       title = unu(title).replace("\n", " ")
                                         
-                                       message = (headers
-                                                          + "\nSubject: " + title
-                                                          + "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
-                                                          + "\nUser-Agent: rss2email"
-                                                          + "\n")
+                                       datetime = time.gmtime()
+
+                                       if DATE_HEADER:
+                                               for datetype in DATE_HEADER_ORDER: # most preferred date type first
+                                                       kind = datetype+"_parsed"
+                                                       if entry.get(kind): datetime = entry[kind]; break # first usable date wins; skip missing/empty ones
+                                               
+                                       content = getContent(entry, HTMLOK=HTML_MAIL)
                                         
-                                       message += "\n" + content.strip() + "\n"
+                                       link = unu(entry.get('link', ""))
+
+                                       from_addr = unu(getEmail(r.feed, entry))
+
+                                       message = (
+                                       "From: " + quote822(getName(r.feed, entry)) + " <"+from_addr+">" +
+                                       "\nTo: " + unu(f.to or default_to) + # set a default email!
+                                       "\nSubject: " + title +
+                                       "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) +
+                                       "\nUser-Agent: rss2email" + # really should be X-Mailer 
+                                       "\nContent-Type: ")         # but backwards-compatibility
                                         
-                                       if link: message += "\nURL: " + link + "\n"
+                                       if ishtml(content):
+                                               message += "text/html"
+                                               
+                                               content = ("<html><body>\n\n" + 
+                                                          '<h1><a href="'+link+'">'+title+'</a></h1>\n\n' +
+                                                          unu(content[1]).strip() + # drop type tag (HACK: bad abstraction)
+                                                          '<p>URL: <a href="'+link+'">'+link+'</a></p>' +
+                                                          "\n\n</body></html>")
+                                       else:
+                                               message += "text/plain"
+                                               content = unu(content).strip() + "\n\nURL: "+link
                                         
+                                       message += '; charset="utf-8"\n\n' + content + "\n"
+
                                         if QP_REQUIRED:
-                                               mimify.CHARSET = enc
+                                               mimify.CHARSET = 'utf-8'
                                                 ins, outs = SIO(message), SIO()
-                                               mimify.mimify(ins, outs); outs.seek(0)
-                                               message = outs.read()
+                                               mimify.mimify(ins, outs)
+                                               message = outs.getvalue()
                                         
-                                       send(fr, (f.to or default_to), message)
+                                       send(from_addr, (f.to or default_to), message)
                         
                                         f.seen[frameid] = id
                                         
-                               f.etag, f.modified = result.get('etag', None), result.get('modified', None)
+                               f.etag, f.modified = r.get('etag', None), r.get('modified', None)
                         except:
-                               print "E: could not parse", f.url
-                               traceback.print_exc()
+                               print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
+                               print >>warn, "E: could not parse", f.url
+                               traceback.print_exc(file=warn)
+                               print>>warn, "=== END HERE ==="
                                 continue
  
         finally:                
-               unlock(feeds, ff2)
+               unlock(feeds, feedfileObject)
  
  def list():
-       feeds, ff2 = load(lock=0)
+       feeds, feedfileObject = load(lock=0)
         
         if feeds and isstr(feeds[0]):
                 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
@@ -239,15 +393,16 @@ def list():
                 i+= 1
  
  def delete(n):
-       feeds, ff2 = load()
+       feeds, feedfileObject = load()
         feeds = feeds[:n] + feeds[n+1:]
-       unlock(feeds, ff2)
+       print >>warn, "W: feed IDs may have changed, list before deleting again"
+       unlock(feeds, feedfileObject)
         
  def email(addr):
-       feeds, ff2 = load()
+       feeds, feedfileObject = load()
         if feeds and isstr(feeds[0]): feeds[0] = addr
         else: feeds = [addr] + feeds
-       unlock(feeds, ff2)
+       unlock(feeds, feedfileObject)
  
  if __name__ == '__main__':
         ie, args = "InputError", sys.argv
@@ -260,11 +415,12 @@ if __name__ == '__main__':
                                 def send(x,y,z):
                                         if VERBOSE: print 'Not sending', (
                                         [x for x in z.splitlines() if x.startswith("Subject:")][0])
-                       run()
+
+                       if args and args[-1].isdigit(): run(int(args[-1]))
+                       else: run()
  
                 elif action == "email":
                         email(args[0])
-                       print "W: Feed IDs may have changed. Run `list` before `delete`."
  
                 elif action == "add": add(*args)
  
@@ -279,7 +435,10 @@ if __name__ == '__main__':
  
                 else:
                         raise ie, "invalid action"
-                       
+               
+               if smtpserver:
+                       smtpserver.quit()
+               
         except ie, e:
                 print "E:", e
                 print
author	Aaron Swartz <me@aaronsw.com>
	Fri, 25 Jun 2004 12:00:00 +0000 (12:00 +0000)
committer	W. Trevor King <wking@tremily.us>
	Fri, 15 Feb 2013 16:56:51 +0000 (11:56 -0500)