2 """rss2email: get RSS feeds emailed to you
3 http://www.aaronsw.com/2002/rss2email
6 new [youremail] (create new feedfile)
7 email [yournewemail] (update default email)
9 add feedurl [youremail]
14 __author__ = "Aaron Swartz (me@aaronsw.com)"
15 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
16 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
17 "Matej Cepl", "Martin 'Joey' Schulze", "Marcel Ackermann (http://www.DreamFlasher.de)", "Lindsey Smith (lindsey.smith@gmail.com)" ]
19 ### Vaguely Customizable Options ###
21 # The email address messages are from by default:
22 DEFAULT_FROM = "bozo@dev.null.invalid"
24 # 1: Send text/html messages when possible.
25 # 0: Convert HTML to plain text.
28 # 1: Only use the DEFAULT_FROM address.
29 # 0: Use the email address specified by the feed, when possible.
32 # 1: Receive one email per post.
33 # 0: Receive an email every time a post changes.
36 # 1: Generate Date header based on item's date, when possible.
37 # 0: Generate Date header based on time sent.
40 # A tuple consisting of some combination of
41 # ('issued', 'created', 'modified', 'expired')
42 # expressing ordered list of preference in dates
43 # to use for the Date header of the email.
44 DATE_HEADER_ORDER = ('modified', 'issued', 'created')
46 # 1: Apply Q-P conversion (required for some MUAs).
47 # 0: Send message in 8-bits.
48 # http://cr.yp.to/smtp/8bitmime.html
51 # 1: Name feeds as they're being processed.
55 # 1: Use the publisher's email if you can't find the author's.
56 # 0: Just use the DEFAULT_FROM email instead.
57 USE_PUBLISHER_EMAIL = 0
59 # 1: Use SMTP_SERVER to send mail.
60 # 0: Call /usr/bin/sendmail to send mail.
63 SMTP_SERVER = "smtp.yourisp.net:25"
64 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
65 SMTP_USER = ' username' # for SMTP AUTH, set SMTP username here
66 SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
68 # Set this to add a bonus header to all emails (start with '\n').
70 # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
72 # Set this to override the From name. Keys are feed URLs, values are the names to use instead.
75 # Note: You can also override the send function.
76 def send(fr, to, message):
79 session = smtplib.SMTP(SMTP_SERVER)
81 session.login(SMTP_USER, SMTP_PASS)
82 session.sendmail(fr, [to], message)
84 i, o = os.popen2(["/usr/sbin/sendmail", to])
89 ## html2text options ##
91 # Use Unicode characters instead of their ASCII pseudo-replacements
94 # Put the links after each paragraph instead of at the end.
95 LINKS_EACH_PARAGRAPH = 0
97 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
100 ### Load the Options ###
102 # Read options from config file if present.
110 ### Import Modules ###
112 import cPickle as pickle, md5, time, os, traceback, urllib2, sys, types
120 import socket; socket_errors = []
121 for e in ['error', 'gaierror']:
122 if hasattr(socket, e): socket_errors.append(getattr(socket, e))
123 import mimify; from StringIO import StringIO as SIO; mimify.CHARSET = 'utf-8'
124 if SMTP_SEND: import smtplib; smtpserver = smtplib.SMTP(SMTP_SERVER)
125 else: smtpserver = None
128 feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"
130 import html2text as h2t
132 h2t.UNICODE_SNOB = UNICODE_SNOB
133 h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
134 h2t.BODY_WIDTH = BODY_WIDTH
135 html2text = h2t.html2text
137 ### Utility Functions ###
def isstr(f):
    """Return True when `f` is a string, either plain or unicode."""
    string_types = (type(''), type(u''))
    return isinstance(f, string_types)
def ishtml(t):
    """Return True when `t` is a tuple.

    getContent tags raw HTML by returning an ('HTML', value) tuple, so a
    tuple here means "this content is HTML". isinstance is used instead of
    an exact type comparison so tuple subclasses are recognised as well.
    """
    return isinstance(t, tuple)
def contains(a, b):
    """Return True when `a.find(b)` locates `b` inside `a`, else False."""
    found_at = a.find(b)
    if found_at == -1:
        # find() reports "not present" with -1.
        return False
    return True
144 def unu(s): # I / freakin' hate / that unicode
145 if type(s) is types.UnicodeType: return s.encode('utf-8')
149 """Quote names in email according to RFC822."""
150 return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"'
153 """QP_CORRUPT headers."""
154 return mimify.mime_encode_header(s + ' ')[:-1]
156 ### Parsing Utilities ###
158 def getContent(entry, HTMLOK=0):
159 """Select the best content from an entry, deHTMLizing if necessary.
160 If raw HTML is best, an ('HTML', best) tuple is returned. """
163 # * We have a bunch of potential contents.
164 # * We go through looking for our first choice.
165 # (HTML or text, depending on HTMLOK)
166 # * If that doesn't work, we go through looking for our second choice.
167 # * If that still doesn't work, we just take the first one.
169 # Possible future improvement:
170 # * Instead of just taking the first one
171 # pick the one in the "best" language.
172 # * HACK: hardcoded HTMLOK, should take a tuple of media types
174 conts = entry.get('content', [])
176 if entry.get('summary_detail', {}):
177 conts += [entry.summary_detail]
182 if contains(c.type, 'html'): return ('HTML', c.value)
185 if c.type == 'text/plain': return c.value
187 if not HTMLOK: # Only need to convert to text if HTML isn't OK
189 if contains(c.type, 'html'):
190 return html2text(c.value)
192 return conts[0].value
197 """Get best ID from an entry."""
199 if 'id' in entry and entry.id: return entry.id
201 content = getContent(entry)
202 if content: return md5.new(unu(content)).hexdigest()
203 if 'link' in entry: return entry.link
204 if 'title' in entry: return md5.new(unu(entry.title)).hexdigest()
206 def getName(r, entry):
207 """Get the best name."""
210 if r.url in OVERRIDE_FROM.keys():
211 return unu(OVERRIDE_FROM[r.url])
213 name = feed.get('title', '')
215 if 'name' in entry.get('author_detail', []): # normally {} but py2.1
216 if entry.author_detail.name:
217 if name: name += ", "
218 name += entry.author_detail.name
220 elif 'name' in feed.get('author_detail', []):
221 if feed.author_detail.name:
222 if name: name += ", "
223 name += feed.author_detail.name
227 def getEmail(feed, entry):
228 """Get the best email_address."""
230 if FORCE_FROM: return DEFAULT_FROM
232 if 'email' in entry.get('author_detail', []):
233 return entry.author_detail.email
235 if 'email' in feed.get('author_detail', []):
236 return feed.author_detail.email
240 if USE_PUBLISHER_EMAIL:
241 if 'email' in feed.get('publisher_detail', []):
242 return feed.publisher_detail.email
244 if feed.get("errorreportsto", ''):
245 return feed.errorreportsto
249 ### Simple Database of Feeds ###
252 def __init__(self, url, to):
253 self.url, self.etag, self.modified, self.seen = url, None, None, {}
257 feedfileObject = open(feedfile, 'r')
258 feeds = pickle.load(feedfileObject)
260 if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)
261 #HACK: to deal with lock caching
262 feedfileObject = open(feedfile, 'r')
263 feeds = pickle.load(feedfileObject)
264 if unix: fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)
266 return feeds, feedfileObject
268 def unlock(feeds, feedfileObject):
270 pickle.dump(feeds, open(feedfile, 'w'))
272 pickle.dump(feeds, open(feedfile+'.tmp', 'w'))
273 os.rename(feedfile+'.tmp', feedfile)
274 fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
276 ### Program Functions ###
279 if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
280 urls, to = [args[0]], args[1]
282 urls, to = args, None
284 feeds, feedfileObject = load()
285 if feeds and not isstr(feeds[0]) and to is None:
286 raise 'NoEmail', "Run `email newaddr` or `add url addr`."
287 for url in urls: feeds.append(Feed(url, to))
288 unlock(feeds, feedfileObject)
291 feeds, feedfileObject = load()
293 # We store the default to address as the first item in the feeds list.
294 # Here we take it out and save it for later.
295 if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
298 if num: ifeeds = [feeds[num]]
302 if VERBOSE: print >>warn, "I: Processing", f.url
303 r = feedparser.parse(f.url, f.etag, f.modified)
305 # Handle various status conditions, as required
307 if r.status == 301: f.url = r['url']
308 elif r.status == 410:
309 print >>warn, "W: feed gone; deleting", f.url
313 http_status = r.get('status', 200)
314 http_headers = r.get('headers', {
315 'content-type': 'application/rss+xml',
316 'content-length':'1'})
317 exc_type = r.get("bozo_exception", Exception()).__class__
318 if http_status != 304 and not r.entries and not r.get('version', ''):
319 if http_status not in [200, 302]:
320 print >>warn, "W: error", http_status, f.url
322 elif contains(http_headers.get('content-type', 'rss'), 'html'):
323 print >>warn, "W: looks like HTML", f.url
325 elif http_headers.get('content-length', '1') == '0':
326 print >>warn, "W: empty page", f.url
328 elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
329 print >>warn, "W: timed out on", f.url
331 elif exc_type == IOError:
332 print >>warn, "W:", r.bozo_exception, f.url
334 elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
335 print >>warn, "W: broken compression", f.url
337 elif exc_type in socket_errors:
338 exc_reason = r.bozo_exception.args[1]
339 print >>warn, "W:", exc_reason, f.url
341 elif exc_type == urllib2.URLError:
342 if r.bozo_exception.reason.__class__ in socket_errors:
343 exc_reason = r.bozo_exception.reason.args[1]
345 exc_reason = r.bozo_exception.reason
346 print >>warn, "W:", exc_reason, f.url
348 elif exc_type == KeyboardInterrupt:
349 raise r.bozo_exception
352 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
353 print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
355 print >>warn, "rss2email", __version__
356 print >>warn, "feedparser", feedparser.__version__
357 print >>warn, "html2text", h2t.__version__
358 print >>warn, "Python", sys.version
359 print >>warn, "=== END HERE ==="
364 for entry in r.entries:
367 # If TRUST_GUID isn't set, we get back hashes of the content.
368 # Instead of letting these run wild, we put them in context
369 # by associating them with the actual ID (if it exists).
371 frameid = entry.get('id', id)
373 # If this item's ID is in our database
374 # then it's already been sent
375 # and we don't need to do anything more.
377 if f.seen.has_key(frameid) and f.seen[frameid] == id: continue
379 if 'title_detail' in entry and entry.title_detail:
380 title = entry.title_detail.value
381 if contains(entry.title_detail.type, 'html'):
382 title = html2text(title)
384 title = getContent(entry)[:70]
386 title = unu(title).replace("\n", " ")
388 datetime = time.gmtime()
391 for datetype in DATE_HEADER_ORDER:
392 kind = datetype+"_parsed"
393 if kind in entry and entry[kind]: datetime = entry[kind]
395 content = getContent(entry, HTMLOK=HTML_MAIL)
397 link = unu(entry.get('link', ""))
399 from_addr = unu(getEmail(r.feed, entry))
402 "From: " + quote822(header7bit(getName(r, entry))) + " <"+from_addr+">" +
403 "\nTo: " + header7bit(unu(f.to or default_to)) + # set a default email!
404 "\nSubject: " + unu(html2text(header7bit(title))).strip() +
405 "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) +
406 "\nUser-Agent: rss2email" + # really should be X-Mailer
408 "\nContent-Type: ") # but backwards-compatibility
411 message += "text/html"
413 content = ("<html><body>\n\n" +
414 '<h1><a href="'+link+'">'+title+'</a></h1>\n\n' +
415 unu(content[1]).strip() + # drop type tag (HACK: bad abstraction)
416 '<p>URL: <a href="'+link+'">'+link+'</a></p>' +
417 "\n\n</body></html>")
419 message += "text/plain"
420 content = unu(content).strip() + "\n\nURL: "+link
422 message += '; charset="utf-8"\n\n' + content + "\n"
425 ins, outs = SIO(message), SIO()
426 mimify.mimify(ins, outs)
427 message = outs.getvalue()
429 send(from_addr, (f.to or default_to), message)
433 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
434 except KeyboardInterrupt:
437 print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
438 print >>warn, "E: could not parse", f.url
439 #if title: print >>warn, "Entry entitled: ", title
440 traceback.print_exc(file=warn)
441 print >>warn, "rss2email", __version__
442 print >>warn, "feedparser", feedparser.__version__
443 print >>warn, "html2text", h2t.__version__
444 print >>warn, "Python", sys.version
445 print >>warn, "=== END HERE ==="
449 unlock(feeds, feedfileObject)
452 feeds, feedfileObject = load(lock=0)
454 if feeds and isstr(feeds[0]):
455 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
456 print "default email:", default_to
457 else: ifeeds = feeds; i = 0
459 print `i`+':', f.url, '('+(f.to or ('default: '+default_to))+')'
463 feeds, feedfileObject = load()
464 feeds = feeds[:n] + feeds[n+1:]
465 print >>warn, "W: feed IDs may have changed, list before deleting again"
466 unlock(feeds, feedfileObject)
469 feeds, feedfileObject = load()
470 if feeds and isstr(feeds[0]): feeds[0] = addr
471 else: feeds = [addr] + feeds
472 unlock(feeds, feedfileObject)
474 if __name__ == '__main__':
475 ie, args = "InputError", sys.argv
477 if VERBOSE: print 'args == %s' % args
478 if len(args) < 3: raise ie, "insufficient args"
479 feedfile, action, args = args[1], args[2], args[3:]
482 if args and args[0] == "--no-send":
484 if VERBOSE: print 'Not sending', (
485 [x for x in z.splitlines() if x.startswith("Subject:")][0])
487 if args and args[-1].isdigit(): run(int(args[-1]))
490 elif action == "email":
493 elif action == "add": add(*args)
495 elif action == "new":
496 if len(args) == 1: d = [args[0]]
498 pickle.dump(d, open(feedfile, 'w'))
500 elif action == "list": list()
502 elif action == "delete": delete(int(args[0]))
505 raise ie, "invalid action"