rss2email.py

   1 #!/usr/bin/python
   2 """rss2email: get RSS feeds emailed to you
   3 http://www.aaronsw.com/2002/rss2email
   4
   5 Usage:
   6   new [youremail] (create new feedfile)
   7   email [yournewemail] (update default email)
   8   run [--no-send] [num]
   9   add feedurl [youremail]
  10   list
  11   delete n
  12 """
  13 __version__ = "2.56"
  14 __author__ = "Aaron Swartz (me@aaronsw.com)"
  15 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
  16 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
  17                      "Matej Cepl", "Martin 'Joey' Schulze", "Marcel Ackermann (http://www.DreamFlasher.de)", "Lindsey Smith (lindsey.smith@gmail.com)" ]
  18
  19 ### Vaguely Customizable Options ###
  20
  21 # The email address messages are from by default:
  22 DEFAULT_FROM = "bozo@dev.null.invalid"
  23
  24 # 1: Send text/html messages when possible.
  25 # 0: Convert HTML to plain text.
  26 HTML_MAIL = 0
  27
  28 # 1: Only use the DEFAULT_FROM address.
  29 # 0: Use the email address specified by the feed, when possible.
  30 FORCE_FROM = 0
  31
  32 # 1: Receive one email per post.
  33 # 0: Receive an email every time a post changes.
  34 TRUST_GUID = 1
  35
  36 # 1: Generate Date header based on item's date, when possible.
  37 # 0: Generate Date header based on time sent.
  38 DATE_HEADER = 0
  39
  40 # A tuple consisting of some combination of
  41 # ('issued', 'created', 'modified', 'expired')
  42 # expressing ordered list of preference in dates
  43 # to use for the Date header of the email.
  44 DATE_HEADER_ORDER = ('modified', 'issued', 'created')
  45
  46 # 1: Apply Q-P conversion (required for some MUAs).
  47 # 0: Send message in 8-bits.
  48 # http://cr.yp.to/smtp/8bitmime.html
  49 QP_REQUIRED = 0
  50
  51 # 1: Name feeds as they're being processed.
  52 # 0: Keep quiet.
  53 VERBOSE = 0
  54
  55 # 1: Use the publisher's email if you can't find the author's.
  56 # 0: Just use the DEFAULT_FROM email instead.
  57 USE_PUBLISHER_EMAIL = 0
  58
  59 # 1: Use SMTP_SERVER to send mail.
  60 # 0: Call /usr/bin/sendmail to send mail.
  61 SMTP_SEND = 0
  62
  63 SMTP_SERVER = "smtp.yourisp.net:25"
  64 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
  65 SMTP_USER = ' username'  # for SMTP AUTH, set SMTP username here
  66 SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here
  67
  68 # Set this to add a bonus header to all emails (start with '\n').
  69 BONUS_HEADER = ''
  70 # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
  71
  72 # Set this to override From addresses. Keys are feed URLs, values are new titles.
  73 OVERRIDE_FROM = {}
  74
  75 # Note: You can also override the send function.
  76 def send(fr, to, message):
  77         if SMTP_SEND:
  78                 import smtplib
  79                 session = smtplib.SMTP(SMTP_SERVER)
  80                 if AUTHREQUIRED:
  81                         session.login(SMTP_USER, SMTP_PASS)
  82                 session.sendmail(fr, [to], message)
  83         else:
  84                 i, o = os.popen2(["/usr/sbin/sendmail", to])
  85                 i.write(message)
  86                 i.close(); o.close()
  87                 del i, o
  88
   89 ## html2text options ##
   90
   91 # Use Unicode characters instead of their ASCII pseudo-replacements.
   92 UNICODE_SNOB = 0
   93
   94 # Put the links after each paragraph instead of at the end.
   95 LINKS_EACH_PARAGRAPH = 0
   96
   97 # Wrap long lines at this position. 0 for no wrapping. (Requires Python 2.3.)
   98 BODY_WIDTH = 0
  99
 100 ### Load the Options ###
 101
 102 # Read options from config file if present.
 103 import sys
 104 sys.path.append(".")
 105 try:
 106         from config import *
 107 except:
 108         pass
 109
 110 ### Import Modules ###
 111
 112 import cPickle as pickle, md5, time, os, traceback, urllib2, sys, types
 113 unix = 0
 114 try:
 115         import fcntl
 116         unix = 1
 117 except:
 118         pass
 119
 120 import socket; socket_errors = []
 121 for e in ['error', 'gaierror']:
 122         if hasattr(socket, e): socket_errors.append(getattr(socket, e))
  123 import mimify; from StringIO import StringIO as SIO; mimify.CHARSET = 'utf-8'  # mimify does the header encoding and the optional Q-P conversion
  124 if SMTP_SEND: import smtplib; smtpserver = smtplib.SMTP(SMTP_SERVER)  # connects at import time; NOTE(review): send() opens its own session, this one is only quit() in __main__
  125 else: smtpserver = None
  126
  127 import feedparser
  128 feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"
  129
  130 import html2text as h2t
  131
  132 h2t.UNICODE_SNOB = UNICODE_SNOB  # copy the html2text options from this file into the h2t module
  133 h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
  134 h2t.BODY_WIDTH = BODY_WIDTH
  135 html2text = h2t.html2text  # module-level shortcut used by the parsing code
 136
 137 ### Utility Functions ###
 138
 139 warn = sys.stderr
 140
 141 def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
 142 def ishtml(t): return type(t) is type(())
 143 def contains(a,b): return a.find(b) != -1
 144 def unu(s): # I / freakin' hate / that unicode
 145         if type(s) is types.UnicodeType: return s.encode('utf-8')
 146         else: return s
 147
 148 def quote822(s):
 149         """Quote names in email according to RFC822."""
 150         return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"'
 151
 152 def header7bit(s):
 153         """QP_CORRUPT headers."""
 154         return mimify.mime_encode_header(s + ' ')[:-1]
 155
 156 ### Parsing Utilities ###
 157
 158 def getContent(entry, HTMLOK=0):
 159         """Select the best content from an entry, deHTMLizing if necessary.
 160         If raw HTML is best, an ('HTML', best) tuple is returned. """
 161
 162         # How this works:
 163         #  * We have a bunch of potential contents.
 164         #  * We go thru looking for our first choice.
 165         #    (HTML or text, depending on HTMLOK)
 166         #  * If that doesn't work, we go thru looking for our second choice.
 167         #  * If that still doesn't work, we just take the first one.
 168         #
 169         # Possible future improvement:
 170         #  * Instead of just taking the first one
 171         #    pick the one in the "best" language.
 172         #  * HACK: hardcoded HTMLOK, should take a tuple of media types
 173
 174         conts = entry.get('content', [])
 175
 176         if entry.get('summary_detail', {}):
 177                 conts += [entry.summary_detail]
 178
 179         if conts:
 180                 if HTMLOK:
 181                         for c in conts:
 182                                 if contains(c.type, 'html'): return ('HTML', c.value)
 183
 184                 for c in conts:
 185                         if c.type == 'text/plain': return c.value
 186
 187                 if not HTMLOK: # Only need to convert to text if HTML isn't OK
 188                         for c in conts:
 189                                 if contains(c.type, 'html'):
 190                                         return html2text(c.value)
 191
 192                 return conts[0].value
 193
 194         return ""
 195
 196 def getID(entry):
 197         """Get best ID from an entry."""
 198         if TRUST_GUID:
 199                 if 'id' in entry and entry.id: return entry.id
 200
 201         content = getContent(entry)
 202         if content: return md5.new(unu(content)).hexdigest()
 203         if 'link' in entry: return entry.link
 204         if 'title' in entry: return md5.new(unu(entry.title)).hexdigest()
 205
 206 def getName(r, entry):
 207         """Get the best name."""
 208
 209         feed = r.feed
 210         if r.url in OVERRIDE_FROM.keys():
 211                 return unu(OVERRIDE_FROM[r.url])
 212
 213         name = feed.get('title', '')
 214
 215         if 'name' in entry.get('author_detail', []): # normally {} but py2.1
 216                 if entry.author_detail.name:
 217                         if name: name += ", "
 218                         name +=  entry.author_detail.name
 219
 220         elif 'name' in feed.get('author_detail', []):
 221                 if feed.author_detail.name:
 222                         if name: name += ", "
 223                         name += feed.author_detail.name
 224
 225         return name
 226
 227 def getEmail(feed, entry):
 228         """Get the best email_address."""
 229
 230         if FORCE_FROM: return DEFAULT_FROM
 231
 232         if 'email' in entry.get('author_detail', []):
 233                 return entry.author_detail.email
 234
 235         if 'email' in feed.get('author_detail', []):
 236                 return feed.author_detail.email
 237
 238         #TODO: contributors
 239
 240         if USE_PUBLISHER_EMAIL:
 241                 if 'email' in feed.get('publisher_detail', []):
 242                         return feed.publisher_detail.email
 243
 244                 if feed.get("errorreportsto", ''):
 245                         return feed.errorreportsto
 246
 247         return DEFAULT_FROM
 248
 249 ### Simple Database of Feeds ###
 250
 251 class Feed:
 252         def __init__(self, url, to):
 253                 self.url, self.etag, self.modified, self.seen = url, None, None, {}
 254                 self.to = to
 255
  256 def load(lock=1):  # read the pickled feed list from the module-level `feedfile` path; returns (feeds, open file object)
  257         feedfileObject = open(feedfile, 'r')
  258         feeds = pickle.load(feedfileObject)  # NOTE(review): pickle executes what it loads; the feed file must be trusted
  259         if lock:
  260                 if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)  # take an exclusive lock on the first handle
  261                 #HACK: to deal with lock caching
  262                 feedfileObject = open(feedfile, 'r')  # the name is rebound to a fresh handle and the list is read again
  263                 feeds = pickle.load(feedfileObject)
  264                 if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)  # lock the handle that is actually returned
  265
  266         return feeds, feedfileObject  # with lock=0 the handle is returned open and unlocked
 267
 268 def unlock(feeds, feedfileObject):
 269         if not unix:
 270                 pickle.dump(feeds, open(feedfile, 'w'))
 271         else:
 272                 pickle.dump(feeds, open(feedfile+'.tmp', 'w'))
 273                 os.rename(feedfile+'.tmp', feedfile)
 274                 fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
 275
 276 ### Program Functions ###
 277
 278 def add(*args):
 279         if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
 280                 urls, to = [args[0]], args[1]
 281         else:
 282                 urls, to = args, None
 283
 284         feeds, feedfileObject = load()
 285         if feeds and not isstr(feeds[0]) and to is None:
 286                 raise 'NoEmail', "Run `email newaddr` or `add url addr`."
 287         for url in urls: feeds.append(Feed(url, to))
 288         unlock(feeds, feedfileObject)
 289
 290 def run(num=None):
 291         feeds, feedfileObject = load()
 292         try:
 293                 # We store the default to address as the first item in the feeds list.
 294                 # Here we take it out and save it for later.
 295                 if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
 296                 else: ifeeds = feeds
 297
 298                 if num: ifeeds = [feeds[num]]
 299
 300                 for f in ifeeds:
 301                         try:
 302                                 if VERBOSE: print >>warn, "I: Processing", f.url
 303                                 r = feedparser.parse(f.url, f.etag, f.modified)
 304
 305                                 # Handle various status conditions, as required
 306                                 if 'status' in r:
 307                                         if r.status == 301: f.url = r['url']
 308                                         elif r.status == 410:
 309                                                 print >>warn, "W: feed gone; deleting", f.url
 310                                                 feeds.remove(f)
 311                                                 continue
 312
 313                                 http_status = r.get('status', 200)
 314                                 http_headers = r.get('headers', {
 315                                   'content-type': 'application/rss+xml',
 316                                   'content-length':'1'})
 317                                 exc_type = r.get("bozo_exception", Exception()).__class__
 318                                 if http_status != 304 and not r.entries and not r.get('version', ''):
 319                                         if http_status not in [200, 302]:
 320                                                 print >>warn, "W: error", http_status, f.url
 321
 322                                         elif contains(http_headers.get('content-type', 'rss'), 'html'):
 323                                                 print >>warn, "W: looks like HTML", f.url
 324
 325                                         elif http_headers.get('content-length', '1') == '0':
 326                                                 print >>warn, "W: empty page", f.url
 327
 328                                         elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
 329                                                 print >>warn, "W: timed out on", f.url
 330
 331                                         elif exc_type == IOError:
 332                                                 print >>warn, "W:", r.bozo_exception, f.url
 333
 334                                         elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
 335                                                 print >>warn, "W: broken compression", f.url
 336
 337                                         elif exc_type in socket_errors:
 338                                                 exc_reason = r.bozo_exception.args[1]
 339                                                 print >>warn, "W:", exc_reason, f.url
 340
 341                                         elif exc_type == urllib2.URLError:
 342                                                 if r.bozo_exception.reason.__class__ in socket_errors:
 343                                                         exc_reason = r.bozo_exception.reason.args[1]
 344                                                 else:
 345                                                         exc_reason = r.bozo_exception.reason
 346                                                 print >>warn, "W:", exc_reason, f.url
 347
 348                                         elif exc_type == KeyboardInterrupt:
 349                                                 raise r.bozo_exception
 350
 351                                         else:
 352                                                 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
 353                                                 print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
 354                                                 print >>warn, r
 355                                                 print >>warn, "rss2email", __version__
 356                                                 print >>warn, "feedparser", feedparser.__version__
 357                                                 print >>warn, "html2text", h2t.__version__
 358                                                 print >>warn, "Python", sys.version
 359                                                 print >>warn, "=== END HERE ==="
 360                                         continue
 361
 362                                 r.entries.reverse()
 363
 364                                 for entry in r.entries:
 365                                         id = getID(entry)
 366
 367                                         # If TRUST_GUID isn't set, we get back hashes of the content.
 368                                         # Instead of letting these run wild, we put them in context
 369                                         # by associating them with the actual ID (if it exists).
 370
 371                                         frameid = entry.get('id', id)
 372
 373                                         # If this item's ID is in our database
 374                                         # then it's already been sent
 375                                         # and we don't need to do anything more.
 376
 377                                         if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
 378
 379                                         if 'title_detail' in entry and entry.title_detail:
 380                                                 title = entry.title_detail.value
 381                                                 if contains(entry.title_detail.type, 'html'):
 382                                                         title = html2text(title)
 383                                         else:
 384                                                 title = getContent(entry)[:70]
 385
 386                                         title = unu(title).replace("\n", " ")
 387
 388                                         datetime = time.gmtime()
 389
 390                                         if DATE_HEADER:
 391                                                 for datetype in DATE_HEADER_ORDER:
 392                                                         kind = datetype+"_parsed"
 393                                                         if kind in entry and entry[kind]: datetime = entry[kind]
 394
 395                                         content = getContent(entry, HTMLOK=HTML_MAIL)
 396
 397                                         link = unu(entry.get('link', ""))
 398
 399                                         from_addr = unu(getEmail(r.feed, entry))
 400
 401                                         message = (
 402                                         "From: " + quote822(header7bit(getName(r, entry))) + " <"+from_addr+">" +
 403                                         "\nTo: " + header7bit(unu(f.to or default_to)) + # set a default email!
 404                                         "\nSubject: " + unu(html2text(header7bit(title))).strip() +
 405                                         "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) +
 406                                         "\nUser-Agent: rss2email" + # really should be X-Mailer
 407                                         BONUS_HEADER +
 408                                         "\nContent-Type: ")         # but backwards-compatibility
 409
 410                                         if ishtml(content):
 411                                                 message += "text/html"
 412
 413                                                 content = ("<html><body>\n\n" +
 414                                                            '<h1><a href="'+link+'">'+title+'</a></h1>\n\n' +
 415                                                            unu(content[1]).strip() + # drop type tag (HACK: bad abstraction)
 416                                                            '<p>URL: <a href="'+link+'">'+link+'</a></p>' +
 417                                                            "\n\n</body></html>")
 418                                         else:
 419                                                 message += "text/plain"
 420                                                 content = unu(content).strip() + "\n\nURL: "+link
 421
 422                                         message += '; charset="utf-8"\n\n' + content + "\n"
 423
 424                                         if QP_REQUIRED:
 425                                                 ins, outs = SIO(message), SIO()
 426                                                 mimify.mimify(ins, outs)
 427                                                 message = outs.getvalue()
 428
 429                                         send(from_addr, (f.to or default_to), message)
 430
 431                                         f.seen[frameid] = id
 432
 433                                 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
 434                         except KeyboardInterrupt:
 435                                 raise
 436                         except:
 437                                 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
 438                                 print >>warn, "E: could not parse", f.url
 439                                 #if title: print >>warn, "Entry entitled: ", title
 440                                 traceback.print_exc(file=warn)
 441                                 print >>warn, "rss2email", __version__
 442                                 print >>warn, "feedparser", feedparser.__version__
 443                                 print >>warn, "html2text", h2t.__version__
 444                                 print >>warn, "Python", sys.version
 445                                 print >>warn, "=== END HERE ==="
 446                                 continue
 447
 448         finally:
 449                 unlock(feeds, feedfileObject)
 450
 451 def list():
 452         feeds, feedfileObject = load(lock=0)
 453
 454         if feeds and isstr(feeds[0]):
 455                 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
 456                 print "default email:", default_to
 457         else: ifeeds = feeds; i = 0
 458         for f in ifeeds:
 459                 print `i`+':', f.url, '('+(f.to or ('default: '+default_to))+')'
 460                 i+= 1
 461
 462 def delete(n):
 463         feeds, feedfileObject = load()
 464         feeds = feeds[:n] + feeds[n+1:]
 465         print >>warn, "W: feed IDs may have changed, list before deleting again"
 466         unlock(feeds, feedfileObject)
 467
 468 def email(addr):
 469         feeds, feedfileObject = load()
 470         if feeds and isstr(feeds[0]): feeds[0] = addr
 471         else: feeds = [addr] + feeds
 472         unlock(feeds, feedfileObject)
 473
 474 if __name__ == '__main__':
 475         ie, args = "InputError", sys.argv
 476         try:
 477                 if VERBOSE: print 'args == %s' % args
 478                 if len(args) < 3: raise ie, "insufficient args"
 479                 feedfile, action, args = args[1], args[2], args[3:]
 480
 481                 if action == "run":
 482                         if args and args[0] == "--no-send":
 483                                 def send(x,y,z):
 484                                         if VERBOSE: print 'Not sending', (
 485                                         [x for x in z.splitlines() if x.startswith("Subject:")][0])
 486
 487                         if args and args[-1].isdigit(): run(int(args[-1]))
 488                         else: run()
 489
 490                 elif action == "email":
 491                         email(args[0])
 492
 493                 elif action == "add": add(*args)
 494
 495                 elif action == "new":
 496                         if len(args) == 1: d = [args[0]]
 497                         else: d = []
 498                         pickle.dump(d, open(feedfile, 'w'))
 499
 500                 elif action == "list": list()
 501
 502                 elif action == "delete": delete(int(args[0]))
 503
 504                 else:
 505                         raise ie, "invalid action"
 506
 507                 if smtpserver:
 508                         smtpserver.quit()
 509
 510         except ie, e:
 511                 print "E:", e
 512                 print
 513                 print __doc__
 514
 515