From: Aaron Swartz Date: Wed, 11 Feb 2004 12:00:00 +0000 (+0000) Subject: Add rss2email v2.3 X-Git-Tag: v2.3 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=a0ce90775dd4b746c7552ee55c88524e67d7666c;p=rss2email.git Add rss2email v2.3 Far more robust. Fix for Unicode crash. Use guid instead of link for seen frame. (Warning: resends!) Downloaded from: http://web.archive.org/web/20040803080251/http://www.aaronsw.com/2002/rss2email/rss2email-2.3.py --- diff --git a/rss2email.py b/rss2email.py index b7d1d75..124e423 100644 --- a/rss2email.py +++ b/rss2email.py @@ -12,7 +12,7 @@ Usage: python rss2email.py feedfile action [options] list delete n """ -__version__ = "2.28" +__version__ = "2.3" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2." ___contributors__ = ["Dean Jackson (dino@grorg.org)", @@ -76,8 +76,9 @@ import cPickle as pickle, fcntl, md5, time, os, traceback if QP_REQUIRED: import mimify; from StringIO import StringIO as SIO def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) -def e(obj, val): - x = expandEntities(obj[val]) +def e(obj, val, ee=1): + x = obj[val] + if ee: x = expandEntities(x) if type(x) is unicode: x = x.encode('utf-8') return x.strip() @@ -87,30 +88,30 @@ def quoteEmailName(s): def getContent(item, url): if item.has_key('content') and item['content']: for c in item['content']: - if c['type'] == 'text/plain': return c['value'] + if c['type'] == 'text/plain': return e(c, 'value') for c in item['content']: if c['type'].find('html') != -1: - return html2text(c['value'], c['base']) + return html2text(e(c, 'value', ee=0), c['base']) - return item['content'][0]['value'] + return e(item['content'][0], 'value') if item.has_key('description'): if TREAT_DESCRIPTION_AS_HTML: - return html2text(item['description'], url) + return html2text(e(item, 'description', ee=0), url) else: - return item['description'] + return e(item, 'description') - if item.has_key('summary'): return item['summary'] + if item.has_key('summary'): return e(item, 'summary') return "" def getID(item, content): if TRUST_GUID: - if item.has_key('id') and item['id']: return item['id'] + if item.has_key('id') and item['id']: return e(item, 'id') if content: return md5.new(content).hexdigest() - if item.has_key('link'): return item['link'] - if item.has_key('title'): return md5.new(item['title']).hexdigest() + if item.has_key('link'): return e(item, 'link') + if item.has_key('title'): return md5.new(e(item, 'title')).hexdigest() class Feed: def __init__(self, url, to): @@ -120,11 +121,18 @@ class Feed: def load(lock=1): ff2 = open(feedfile, 'r') feeds = pickle.load(ff2) - if lock: fcntl.flock(ff2, fcntl.LOCK_EX) + if lock: + fcntl.flock(ff2, fcntl.LOCK_EX) + #HACK: to deal with lock caching + ff2 = open(feedfile, 'r') + feeds = pickle.load(ff2) + fcntl.flock(ff2, fcntl.LOCK_EX) + return feeds, ff2 def unlock(feeds, ff2): - pickle.dump(feeds, open(feedfile, 'w')) + pickle.dump(feeds, open(feedfile+'.tmp', 'w')) + os.rename(feedfile+'.tmp', feedfile) fcntl.flock(ff2, fcntl.LOCK_UN) def add(url, to=None): @@ -136,83 +144,88 @@ def add(url, to=None): def run(): feeds, ff2 = load() - - if isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] - else: ifeeds = feeds - - for f in ifeeds: - if VERBOSE: print "Processing", f.url - try: result = feedparser.parse(f.url, f.etag, f.modified) - except: - print "E: could not parse", f.url - traceback.print_exc() - continue - - if result.has_key('status') and result['status'] == 301: f.url = result['url'] - - if result.has_key('encoding'): enc = result['encoding'] - else: enc = 'utf-8' - - c, ert = result['channel'], 'errorreportsto' - - headers = "From: " - if c.has_key('title'): headers += quoteEmailName(e(c, 'title')) + ' ' - if FORCE_FROM and c.has_key(ert) and c[ert].startswith('mailto:'): - fr = c[ert][7:] - else: - fr = DEFAULT_FROM + try: + if isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] + else: ifeeds = feeds - headers += '<'+fr+'>' + for f in ifeeds: + try: + if VERBOSE: print "Processing", f.url + result = feedparser.parse(f.url, f.etag, f.modified) - headers += "\nTo: " + (f.to or default_to) # set a default email! - if not QP_REQUIRED: - headers += '\nContent-Type: text/plain; charset="' + enc + '"' - - if not result['items'] and ((not result.has_key('status') or (result.has_key('status') and result['status'] != 304))): - print "W: no items; invalid feed? (" + f.url + ")" - continue - - for i in result['items']: - content = getContent(i, f.url) - id = getID(i, content) - - if i.has_key('link') and i['link']: frameid = link = e(i, 'link') - else: frameid = id; link = None - - if f.seen.has_key(frameid) and f.seen[frameid] == id: - continue # have seen - - if i.has_key('title'): title = e(i, 'title') - else: title = content[:70].replace("\n", " ") - - if DATE_HEADER and i.has_key('date_parsed'): - datetime = i['date_parsed'] - else: - datetime = time.gmtime() - - message = (headers - + "\nSubject: " + title - + "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) - + "\nUser-Agent: rss2email" - + "\n") - - message += "\n" + content.strip() + "\n" - - if link: message += "\nURL: " + link + "\n" + if result.has_key('status') and result['status'] == 301: f.url = result['url'] + + if result.has_key('encoding'): enc = result['encoding'] + else: enc = 'utf-8' + + c, ert = result['channel'], 'errorreportsto' + + headers = "From: " + if c.has_key('title'): headers += quoteEmailName(e(c, 'title')) + ' ' + if FORCE_FROM and c.has_key(ert) and c[ert].startswith('mailto:'): + fr = c[ert][7:] + else: + fr = DEFAULT_FROM + + headers += '<'+fr+'>' + + headers += "\nTo: " + (f.to or default_to) # set a default email! + if not QP_REQUIRED: + headers += '\nContent-Type: text/plain; charset="' + enc + '"' + + if not result['items'] and ((not result.has_key('status') or (result.has_key('status') and result['status'] != 304))): + print "W: no items; invalid feed? (" + f.url + ")" + continue - if QP_REQUIRED: - mimify.CHARSET = enc - ins, outs = SIO(message), SIO() - mimify.mimify(ins, outs); outs.seek(0) - message = outs.read() + for i in result['items']: + content = getContent(i, f.url) + id = getID(i, content) + + if i.has_key('link') and i['link']: link = e(i, 'link') + else: link = None + + if i.has_key('id') and i['id']: frameid = e(i, 'id') + else: frameid = id + + if f.seen.has_key(frameid) and f.seen[frameid] == id: + continue # have seen - send(fr, (f.to or default_to), message) - - f.seen[frameid] = id + if i.has_key('title'): title = e(i, 'title') + else: title = content[:70].replace("\n", " ") + + if DATE_HEADER and i.has_key('date_parsed'): + datetime = i['date_parsed'] + else: + datetime = time.gmtime() + + message = (headers + + "\nSubject: " + title + + "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) + + "\nUser-Agent: rss2email" + + "\n") + + message += "\n" + content.strip() + "\n" + + if link: message += "\nURL: " + link + "\n" + + if QP_REQUIRED: + mimify.CHARSET = enc + ins, outs = SIO(message), SIO() + mimify.mimify(ins, outs); outs.seek(0) + message = outs.read() + + send(fr, (f.to or default_to), message) - f.etag, f.modified = result.get('etag', None), result.get('modified', None) - - unlock(feeds, ff2) + f.seen[frameid] = id + + f.etag, f.modified = result.get('etag', None), result.get('modified', None) + except: + print "E: could not parse", f.url + traceback.print_exc() + continue + + finally: + unlock(feeds, ff2) def list(): feeds, ff2 = load(lock=0) @@ -270,4 +283,4 @@ if __name__ == '__main__': except ie, e: print "E:", e print - print __doc__ \ No newline at end of file + print __doc__