Add rss2email v2.56
[rss2email.git] / rss2email.py
1 #!/usr/bin/python
2 """rss2email: get RSS feeds emailed to you
3 http://www.aaronsw.com/2002/rss2email
4
5 Usage:
6   new [youremail] (create new feedfile)
7   email [yournewemail] (update default email)
8   run [--no-send] [num]
9   add feedurl [youremail]
10   list
11   delete n
12 """
__version__ = "2.56"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
# People who have contributed to rss2email.
__contributors__ = [
    "Dean Jackson",
    "Brian Lalor",
    "Joey Hess",
    "Matej Cepl",
    "Martin 'Joey' Schulze",
    "Marcel Ackermann (http://www.DreamFlasher.de)",
    "Lindsey Smith (lindsey.smith@gmail.com)",
]
# Backward-compatible alias: the list used to be published only under this
# misspelled (triple leading underscore) name.
___contributors__ = __contributors__
18
### Vaguely Customizable Options ###

# The email address messages are from by default:
DEFAULT_FROM = "bozo@dev.null.invalid"

# 1: Send text/html messages when possible.
# 0: Convert HTML to plain text.
HTML_MAIL = 0

# 1: Only use the DEFAULT_FROM address.
# 0: Use the email address specified by the feed, when possible.
FORCE_FROM = 0

# 1: Receive one email per post.
# 0: Receive an email every time a post changes.
TRUST_GUID = 1

# 1: Generate Date header based on item's date, when possible.
# 0: Generate Date header based on time sent.
DATE_HEADER = 0

# A tuple consisting of some combination of
# ('issued', 'created', 'modified', 'expired')
# expressing ordered list of preference in dates
# to use for the Date header of the email.
DATE_HEADER_ORDER = ('modified', 'issued', 'created')

# 1: Apply Q-P conversion (required for some MUAs).
# 0: Send message in 8-bits.
# http://cr.yp.to/smtp/8bitmime.html
QP_REQUIRED = 0

# 1: Name feeds as they're being processed.
# 0: Keep quiet.
VERBOSE = 0

# 1: Use the publisher's email if you can't find the author's.
# 0: Just use the DEFAULT_FROM email instead.
USE_PUBLISHER_EMAIL = 0

# 1: Use SMTP_SERVER to send mail.
# 0: Call /usr/sbin/sendmail to send mail.
SMTP_SEND = 0

SMTP_SERVER = "smtp.yourisp.net:25"
AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
SMTP_USER = 'username'  # for SMTP AUTH, set SMTP username here
SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here

# Set this to add a bonus header to all emails (start with '\n').
BONUS_HEADER = ''
# Example: BONUS_HEADER = '\nApproved: joe@bob.org'

# Set this to override From names. Keys are feed URLs, values are the
# names to use instead of the one derived from the feed.
OVERRIDE_FROM = {}
74
75 # Note: You can also override the send function.
def send(fr, to, message):
    """Deliver one complete message.

    fr      -- sender address handed to the mail transport
    to      -- the single recipient address
    message -- full message text, headers included

    With SMTP_SEND set the message goes through SMTP_SERVER (logging in
    first when AUTHREQUIRED is set); otherwise it is piped to
    /usr/sbin/sendmail.
    """
    if SMTP_SEND:
        import smtplib
        session = smtplib.SMTP(SMTP_SERVER)
        try:
            if AUTHREQUIRED:
                session.login(SMTP_USER, SMTP_PASS)
            session.sendmail(fr, [to], message)
            session.quit()
        finally:
            # A fresh connection is opened for every message, so make sure
            # it is closed even when login or delivery fails.  close() is
            # harmless after a successful quit().
            session.close()
    else:
        i, o = os.popen2(["/usr/sbin/sendmail", to])
        try:
            i.write(message)
        finally:
            # Always release both pipe ends, even if the write failed.
            i.close(); o.close()
88
## html2text options ##

# Use Unicode characters instead of their ASCII pseudo-replacements.
UNICODE_SNOB = 0

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 0
99
### Load the Options ###

# Read options from config file if present.
import sys
sys.path.append(".")
try:
    from config import *
except ImportError:
    # No config module on the path: keep the defaults.
    pass
except Exception:
    # A config module exists but could not be loaded.  Keep running with
    # the defaults, but say so instead of silently ignoring the settings.
    sys.stderr.write("W: could not load config: %s\n" % (sys.exc_info()[1],))
109         
110 ### Import Modules ###
111
112 import cPickle as pickle, md5, time, os, traceback, urllib2, sys, types
# unix is 1 when the fcntl module (needed for file locking) is available.
unix = 0
try:
    import fcntl
    unix = 1
except ImportError:
    # No fcntl on this platform: run without file locking.
    pass
119                 
import socket
# Collect whichever of socket.error / socket.gaierror this Python provides.
socket_errors = [getattr(socket, e) for e in ['error', 'gaierror'] if hasattr(socket, e)]

import mimify
from StringIO import StringIO as SIO
mimify.CHARSET = 'utf-8'

# Open the SMTP connection up front when SMTP delivery is selected.
if SMTP_SEND:
    import smtplib
    smtpserver = smtplib.SMTP(SMTP_SERVER)
else:
    smtpserver = None
126
import feedparser
# Replace feedparser's USER_AGENT with a string naming rss2email and its version.
feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"

import html2text as h2t

# Copy the html2text options of this file into the html2text module.
h2t.UNICODE_SNOB = UNICODE_SNOB
h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
h2t.BODY_WIDTH = BODY_WIDTH
# Module-level shortcut for the conversion function.
html2text = h2t.html2text
136
137 ### Utility Functions ###
138
# Stream that all warnings and error reports are printed to.
warn = sys.stderr
140
def isstr(f):
    """Return true if f is a plain string or a unicode string."""
    return isinstance(f, (type(''), type(u'')))
def ishtml(t):
    """Return true if t is exactly a tuple (the form getContent uses for raw HTML)."""
    return type(t) is tuple
def contains(a, b):
    """Return true if a.find(b) locates b somewhere in a."""
    position = a.find(b)
    return position >= 0
def unu(s):
    """Return s encoded as UTF-8 when it is a unicode object, else s unchanged."""
    if type(s) is not type(u''):
        return s
    return s.encode('utf-8')
147
def quote822(s):
    """Quote names in email according to RFC822.

    Backslashes and double quotes are backslash-escaped and the result is
    wrapped in double quotes.
    """
    escaped = unu(s).replace("\\", "\\\\")
    escaped = escaped.replace('"', '\\"')
    return '"%s"' % escaped
151
def header7bit(s):
    """Run a header value through mimify's MIME header encoding.

    A space is appended before encoding and the final character is
    dropped again afterwards.
    """
    padded = s + ' '
    encoded = mimify.mime_encode_header(padded)
    return encoded[:-1]
155
156 ### Parsing Utilities ###
157
def getContent(entry, HTMLOK=0):
    """Select the best content from an entry, deHTMLizing if necessary.
    If raw HTML is best, an ('HTML', best) tuple is returned.

    entry  -- parsed feed entry (dict-like, with attribute access)
    HTMLOK -- true if HTML content may be returned as is

    Returns "" when the entry has neither content nor a summary.
    """

    # How this works:
    #  * We have a bunch of potential contents.
    #  * We go thru looking for our first choice.
    #    (HTML or text, depending on HTMLOK)
    #  * If that doesn't work, we go thru looking for our second choice.
    #  * If that still doesn't work, we just take the first one.
    #
    # Possible future improvement:
    #  * Instead of just taking the first one
    #    pick the one in the "best" language.
    #  * HACK: hardcoded HTMLOK, should take a tuple of media types

    # Work on a copy: appending the summary to the entry's own 'content'
    # list would grow that list on every call.
    conts = entry.get('content', [])[:]

    if entry.get('summary_detail', {}):
        conts.append(entry.summary_detail)

    if not conts:
        return ""

    if HTMLOK:
        for c in conts:
            if contains(c.type, 'html'): return ('HTML', c.value)

    for c in conts:
        if c.type == 'text/plain': return c.value

    if not HTMLOK: # Only need to convert to text if HTML isn't OK
        for c in conts:
            if contains(c.type, 'html'):
                return html2text(c.value)

    return conts[0].value
195
def getID(entry):
    """Get best ID from an entry.

    Order of preference: the entry's own id (only when TRUST_GUID is set),
    an MD5 hex digest of its content, its link, an MD5 hex digest of its
    title.  Returns None when none of these is available.
    """
    if TRUST_GUID and 'id' in entry and entry.id:
        return entry.id

    text = getContent(entry)
    if text:
        return md5.new(unu(text)).hexdigest()
    if 'link' in entry:
        return entry.link
    if 'title' in entry:
        return md5.new(unu(entry.title)).hexdigest()
    return None
205
def getName(r, entry):
    """Get the best name.

    r     -- parse result; r.url and r.feed are used
    entry -- the entry the name is wanted for

    An OVERRIDE_FROM value for r.url wins outright.  Otherwise the name is
    the feed title, followed by ", " and an author name taken from the
    entry or, if the entry has no author name field, from the feed.
    """
    if r.url in OVERRIDE_FROM:
        return unu(OVERRIDE_FROM[r.url])

    feed = r.feed
    name = feed.get('title', '')

    # The first source that carries a 'name' field decides, even if that
    # name turns out to be empty.
    for source in (entry, feed):
        detail = source.get('author_detail', []) # normally {} but py2.1
        if 'name' in detail:
            author = detail.name
            if author:
                if name: name += ", "
                name += author
            break

    return name
226
def getEmail(feed, entry):
    """Get the best email_address.

    Returns DEFAULT_FROM when FORCE_FROM is set or nothing better is
    found.  Otherwise the entry author's email is preferred over the feed
    author's; with USE_PUBLISHER_EMAIL set, the feed publisher's email and
    then the feed's errorreportsto value are tried as well.
    """
    if FORCE_FROM:
        return DEFAULT_FROM

    for source in (entry, feed):
        author = source.get('author_detail', [])
        if 'email' in author:
            return author.email

    #TODO: contributors

    if USE_PUBLISHER_EMAIL:
        publisher = feed.get('publisher_detail', [])
        if 'email' in publisher:
            return publisher.email

        if feed.get("errorreportsto", ''):
            return feed.errorreportsto

    return DEFAULT_FROM
248
249 ### Simple Database of Feeds ###
250
class Feed:
    """One subscribed feed as stored in the feed file."""
    def __init__(self, url, to):
        # Where the feed lives and who receives its entries.
        self.url = url
        self.to = to
        # Values passed back to feedparser.parse() on the next fetch.
        self.etag = None
        self.modified = None
        # Entries already handled, keyed by entry id.
        self.seen = {}
255
def load(lock=1):
        """Read the pickled feed list from the global feedfile.

        lock -- when true (the default) and fcntl is available, an
                exclusive flock is taken on the feed file.

        Returns a (feeds, feedfileObject) tuple: the unpickled list and
        the open file object it was read from.
        """
        feedfileObject = open(feedfile, 'r')
        feeds = pickle.load(feedfileObject)
        if lock:
                # Take the exclusive lock on the handle opened above.
                if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)
                #HACK: to deal with lock caching
                # Open and unpickle the file a second time, then lock the
                # new handle; the first handle is dropped by the rebinding.
                feedfileObject = open(feedfile, 'r')
                feeds = pickle.load(feedfileObject)
                if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)

        return feeds, feedfileObject
267
def unlock(feeds, feedfileObject):
    """Write feeds back to the global feedfile and release the lock.

    feeds          -- the feed list to pickle
    feedfileObject -- the open file returned by load()

    Where fcntl is available the data is written to feedfile + '.tmp',
    renamed over feedfile, and the flock on feedfileObject is released.
    Elsewhere feedfile is simply rewritten in place.
    """
    target = feedfile
    if unix:
        target = feedfile + '.tmp'

    out = open(target, 'w')
    try:
        pickle.dump(feeds, out)
    finally:
        # Close explicitly so the data is flushed before the rename below,
        # rather than whenever the file object happens to be collected.
        out.close()

    if unix:
        os.rename(target, feedfile)
        fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
275
276 ### Program Functions ###
277
class NoEmail(Exception):
    """Raised by add() when a new feed would have no address to mail to."""

def add(*args):
    """Append feeds to the feed file.

    Called as add(url, address) the feed is stored with its own recipient
    address.  Any other argument list is treated as feed URLs that use the
    default address.

    Raises NoEmail when no address is given and the feed file holds no
    default address.
    """
    if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
        urls, to = [args[0]], args[1]
    else:
        urls, to = args, None

    feeds, feedfileObject = load()
    try:
        # The default address, when set, is the string stored as the first
        # item.  An empty feed file has no default either.
        has_default = feeds and isstr(feeds[0])
        if to is None and not has_default:
            raise NoEmail("Run `email newaddr` or `add url addr`.")
        for url in urls: feeds.append(Feed(url, to))
    finally:
        # Release the lock even when the request was rejected.
        unlock(feeds, feedfileObject)
289
def run(num=None):
    """Fetch the stored feeds and email every entry not handled before.

    num -- optional index into the stored feed list (the number printed
           by the list command); when given, only that feed is processed.

    The feed list is always written back and unlocked at the end.
    Problems with one feed are reported on stderr and do not stop the
    remaining feeds.  Raises IndexError when num does not name a feed.
    """
    feeds, feedfileObject = load()
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = None
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            # Iterate over a copy: a feed answering 410 is removed from
            # `feeds` inside the loop.
            ifeeds = feeds[:]

        # Compare against None so that feed 0 can be selected as well.
        if num is not None:
            if isstr(feeds[num]):
                raise IndexError("item %d is the default email address, not a feed" % num)
            ifeeds = [feeds[num]]

        for f in ifeeds:
            try:
                to_addr = f.to or default_to
                if not to_addr:
                    print >>warn, "W: no email address set, skipping", f.url
                    continue

                if VERBOSE: print >>warn, "I: Processing", f.url
                r = feedparser.parse(f.url, f.etag, f.modified)

                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml',
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    # Nothing usable came back: work out why and report it.
                    if http_status not in [200, 302]:
                        print >>warn, "W: error", http_status, f.url

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML", f.url

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page", f.url

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on", f.url

                    elif exc_type == IOError:
                        print >>warn, "W:", r.bozo_exception, f.url

                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression", f.url

                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W:", exc_reason, f.url

                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W:", exc_reason, f.url

                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception

                    else:
                        print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                r.entries.reverse()

                for entry in r.entries:
                    entry_id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).

                    frameid = entry.get('id', entry_id)

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.

                    if frameid in f.seen and f.seen[frameid] == entry_id: continue

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = unu(title).replace("\n", " ")

                    datetime = time.gmtime()

                    if DATE_HEADER:
                        # DATE_HEADER_ORDER lists the date kinds in order of
                        # preference, so stop at the first one the entry has.
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype+"_parsed"
                            if kind in entry and entry[kind]:
                                datetime = entry[kind]
                                break

                    content = getContent(entry, HTMLOK=HTML_MAIL)

                    link = unu(entry.get('link', ""))

                    from_addr = unu(getEmail(r.feed, entry))

                    message = (
                    "From: " + quote822(header7bit(getName(r, entry))) + " <"+from_addr+">" +
                    "\nTo: " + header7bit(unu(to_addr)) +
                    "\nSubject: " + unu(html2text(header7bit(title))).strip() +
                    "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) +
                    "\nUser-Agent: rss2email" + # really should be X-Mailer
                    BONUS_HEADER +
                    "\nContent-Type: ")         # but backwards-compatibility

                    if ishtml(content):
                        message += "text/html"

                        content = ("<html><body>\n\n" +
                                   '<h1><a href="'+link+'">'+title+'</a></h1>\n\n' +
                                   unu(content[1]).strip() + # drop type tag (HACK: bad abstraction)
                                   '<p>URL: <a href="'+link+'">'+link+'</a></p>' +
                                   "\n\n</body></html>")
                    else:
                        message += "text/plain"
                        content = unu(content).strip() + "\n\nURL: "+link

                    message += '; charset="utf-8"\n\n' + content + "\n"

                    if QP_REQUIRED:
                        ins, outs = SIO(message), SIO()
                        mimify.mimify(ins, outs)
                        message = outs.getvalue()

                    send(from_addr, to_addr, message)

                    # Only remember the entry once it has actually been sent.
                    f.seen[frameid] = entry_id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except KeyboardInterrupt:
                raise
            except:
                # Deliberate catch-all: one broken feed must not stop the run.
                print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
                print >>warn, "E: could not parse", f.url
                #if title: print >>warn, "Entry entitled: ", title
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:
        unlock(feeds, feedfileObject)
450
451 def list():
452         feeds, feedfileObject = load(lock=0)
453         
454         if feeds and isstr(feeds[0]):
455                 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
456                 print "default email:", default_to
457         else: ifeeds = feeds; i = 0
458         for f in ifeeds:
459                 print `i`+':', f.url, '('+(f.to or ('default: '+default_to))+')'
460                 i+= 1
461
def delete(n):
    """Remove the feed stored at index n of the feed file.

    n is the number printed by the list command.  An index that is out of
    range, or that names the default address rather than a feed, is
    reported on stderr and leaves the feed list unchanged.
    """
    feeds, feedfileObject = load()
    try:
        if n < 0 or n >= len(feeds):
            # A negative index would otherwise duplicate list items through
            # the slice arithmetic; a too-large one would silently do nothing.
            print >>warn, "W: no such feed:", n
        elif isstr(feeds[n]):
            print >>warn, "W: item", n, "is the default email address, not a feed"
        else:
            feeds = feeds[:n] + feeds[n+1:]
            print >>warn, "W: feed IDs may have changed, list before deleting again"
    finally:
        unlock(feeds, feedfileObject)
467         
def email(addr):
    """Store addr as the default address (first item of the feed file)."""
    feeds, feedfileObject = load()
    has_default = feeds and isstr(feeds[0])
    if has_default:
        # Replace the address already stored.
        feeds[0] = addr
    else:
        # No default yet: put it in front of the feeds.
        feeds.insert(0, addr)
    unlock(feeds, feedfileObject)
473
474 if __name__ == '__main__':
475         ie, args = "InputError", sys.argv
476         try:
477                 if VERBOSE: print 'args == %s' % args
478                 if len(args) < 3: raise ie, "insufficient args"
479                 feedfile, action, args = args[1], args[2], args[3:]
480                 
481                 if action == "run": 
482                         if args and args[0] == "--no-send":
483                                 def send(x,y,z):
484                                         if VERBOSE: print 'Not sending', (
485                                         [x for x in z.splitlines() if x.startswith("Subject:")][0])
486
487                         if args and args[-1].isdigit(): run(int(args[-1]))
488                         else: run()
489
490                 elif action == "email":
491                         email(args[0])
492
493                 elif action == "add": add(*args)
494
495                 elif action == "new": 
496                         if len(args) == 1: d = [args[0]]
497                         else: d = []
498                         pickle.dump(d, open(feedfile, 'w'))
499
500                 elif action == "list": list()
501
502                 elif action == "delete": delete(int(args[0]))
503
504                 else:
505                         raise ie, "invalid action"
506                 
507                 if smtpserver:
508                         smtpserver.quit()
509                 
510         except ie, e:
511                 print "E:", e
512                 print
513                 print __doc__
514
515