Add remaining Debian artifacts.
[rss2email.git] / rss2email.py
1 #!/usr/bin/python
2 """rss2email: get RSS feeds emailed to you
3 http://rss2email.infogami.com
4
5 Usage:
6   new [emailaddress] (create new feedfile)
7   email newemailaddress (update default email)
8   run [--no-send] [num]
9   add feedurl [emailaddress]
10   list
11   reset
12   delete n
13   pause n
14   unpause n
15   opmlexport
16   opmlimport filename
17 """
18 __version__ = "2.71"
19 __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
20 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
21 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", 
22                      "Matej Cepl", "Martin 'Joey' Schulze", 
23                      "Marcel Ackermann (http://www.DreamFlasher.de)", 
24                      "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
25
26 import urllib2
27 urllib2.install_opener(urllib2.build_opener())
28
29 ### Vaguely Customizable Options ###
30
31 # The email address messages are from by default:
32 DEFAULT_FROM = "user@rss2email.invalid"
33
34 # 1: Send text/html messages when possible.
35 # 0: Convert HTML to plain text.
36 HTML_MAIL = 0
37
38 # 1: Only use the DEFAULT_FROM address.
39 # 0: Use the email address specified by the feed, when possible.
40 FORCE_FROM = 0
41
42 # 1: Receive one email per post.
43 # 0: Receive an email every time a post changes.
44 TRUST_GUID = 1
45
46 # 1: Generate Date header based on item's date, when possible.
47 # 0: Generate Date header based on time sent.
48 DATE_HEADER = 0
49
50 # A tuple consisting of some combination of
51 # ('issued', 'created', 'modified', 'expired')
52 # expressing ordered list of preference in dates 
53 # to use for the Date header of the email.
54 DATE_HEADER_ORDER = ('modified', 'issued', 'created')
55
56 # 1: Apply Q-P conversion (required for some MUAs).
57 # 0: Send message in 8-bits.
58 # http://cr.yp.to/smtp/8bitmime.html
59 #DEPRECATED 
60 QP_REQUIRED = 0
61 #DEPRECATED 
62         
63 # 1: Name feeds as they're being processed.
64 # 0: Keep quiet.
65 VERBOSE = 0
66
67 # 1: Use the publisher's email if you can't find the author's.
68 # 0: Just use the DEFAULT_FROM email instead.
69 USE_PUBLISHER_EMAIL = 0
70
71 # 1: Use SMTP_SERVER to send mail.
72 # 0: Call /usr/sbin/sendmail to send mail.
73 SMTP_SEND = 0
74
75 SMTP_SERVER = "smtp.yourisp.net:25"
76 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
77 SMTP_USER = 'username'  # for SMTP AUTH, set SMTP username here
78 SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here
79
80 # Connect to the SMTP server using SSL
81 SMTP_SSL = 0
82
83 # Set this to add a bonus header to all emails (start with '\n').
84 BONUS_HEADER = ''
85 # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
86
87 # Set this to override From addresses. Keys are feed URLs, values are new titles.
88 OVERRIDE_FROM = {}
89
90 # Set this to override From email addresses. Keys are feed URLs, values are new emails.
91 OVERRIDE_EMAIL = {}
92
93 # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
94 DEFAULT_EMAIL = {}
95
96 # Only use the email from address rather than friendly name plus email address
97 NO_FRIENDLY_NAME = 0
98
99 # Set this to override the timeout (in seconds) for feed server response
100 FEED_TIMEOUT = 60
101
102 # Optional CSS styling
103 USE_CSS_STYLING = 0
104 STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; }  .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
105
106 # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
107 PROXY=""
108
109 # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
110 # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
111 CHARSET_LIST='US-ASCII', 'ISO-8859-1', 'UTF-8', 'BIG5', 'ISO-2022-JP'
112
113 from email.MIMEText import MIMEText
114 from email.Header import Header as _Header
115 from email.Utils import parseaddr, formataddr
116
117 class Header(_Header):
118     # Work-around for <http://bugs.python.org/issue5871>
119     def append(self, s=None, *args, **kwargs):
120         if s is not None:
121             s = s.replace('\n', ' ').replace('\r', ' ')
122         _Header.append(self, s, *args, **kwargs)
123
124 # Note: You can also override the send function.
125
126 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
127         """Send an email.
128         
129         All arguments should be Unicode strings (plain ASCII works as well).
130         
131         Only the real name part of sender and recipient addresses may contain
132         non-ASCII characters.
133         
134         The email will be properly MIME encoded and delivered though SMTP to
135         localhost port 25.  This is easy to change if you want something different.
136         
137         The charset of the email will be the first one out of the list
138         that can represent all the characters occurring in the email.
139         """
140
141         # Header class is smart enough to try US-ASCII, then the charset we
142         # provide, then fall back to UTF-8.
143         header_charset = 'ISO-8859-1'
144         
145         # We must choose the body charset manually
146         for body_charset in CHARSET_LIST:
147             try:
148                 body.encode(body_charset)
149             except (UnicodeError, LookupError):
150                 pass
151             else:
152                 break
153
154         # Split real name (which is optional) and email address parts
155         sender_name, sender_addr = parseaddr(sender)
156         recipient_name, recipient_addr = parseaddr(recipient)
157         
158         # We must always pass Unicode strings to Header, otherwise it will
159         # use RFC 2047 encoding even on plain ASCII strings.
160         sender_name = str(Header(unicode(sender_name), header_charset))
161         recipient_name = str(Header(unicode(recipient_name), header_charset))
162         
163         # Make sure email addresses do not contain non-ASCII characters
164         sender_addr = sender_addr.encode('ascii')
165         recipient_addr = recipient_addr.encode('ascii')
166         
167         # Create the message ('plain' stands for Content-Type: text/plain)
168         msg = MIMEText(body.encode(body_charset), contenttype, body_charset)
169         msg['To'] = formataddr((recipient_name, recipient_addr))
170         msg['Subject'] = Header(unicode(subject), header_charset)
171         for hdr in extraheaders.keys():
172                 try:
173                         msg[hdr] = Header(unicode(extraheaders[hdr], header_charset))
174                 except:
175                         msg[hdr] = Header(extraheaders[hdr])
176                 
177         fromhdr = formataddr((sender_name, sender_addr))
178         msg['From'] = fromhdr
179
180         msg_as_string = msg.as_string()
181 #DEPRECATED     if QP_REQUIRED:
182 #DEPRECATED             ins, outs = SIO(msg_as_string), SIO()
183 #DEPRECATED             mimify.mimify(ins, outs)
184 #DEPRECATED             msg_as_string = outs.getvalue()
185
186         if SMTP_SEND:
187                 if not smtpserver: 
188                         import smtplib
189                         
190                         try:
191                                 if SMTP_SSL:
192                                         smtpserver = smtplib.SMTP_SSL()
193                                 else:
194                                         smtpserver = smtplib.SMTP()
195                                 smtpserver.connect(SMTP_SERVER)
196                         except KeyboardInterrupt:
197                                 raise
198                         except Exception, e:
199                                 print >>warn, ""
200                                 print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER)
201                                 print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
202                                 if hasattr(e, 'reason'):
203                                         print >>warn, "Reason:", e.reason
204                                 sys.exit(1)
205                                         
206                         if AUTHREQUIRED:
207                                 try:
208                                         smtpserver.ehlo()
209                                         if not SMTP_SSL: smtpserver.starttls()
210                                         smtpserver.ehlo()
211                                         smtpserver.login(SMTP_USER, SMTP_PASS)
212                                 except KeyboardInterrupt:
213                                         raise
214                                 except Exception, e:
215                                         print >>warn, ""
216                                         print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER))
217                                         print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
218                                         if hasattr(e, 'reason'):
219                                                 print >>warn, "Reason:", e.reason
220                                         sys.exit(1)
221                                         
222                 smtpserver.sendmail(sender, recipient, msg_as_string)
223                 return smtpserver
224
225         else:
226                 try:
227                         p = subprocess.Popen(["/usr/sbin/sendmail", recipient], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
228                         p.communicate(msg_as_string)
229                         status = p.returncode
230                         assert status != None, "just a sanity check"
231                         if status != 0:
232                                 print >>warn, ""
233                                 print >>warn, ('Fatal error: sendmail exited with code %s' % status)
234                                 sys.exit(1)
235                 except:
236                         print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email:
237 # 1: Use SMTP_SERVER to send mail.
238 # 0: Call /usr/sbin/sendmail to send mail.
239 SMTP_SEND = 0
240
241 SMTP_SERVER = "smtp.yourisp.net:25"
242 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
243 SMTP_USER = 'username'  # for SMTP AUTH, set SMTP username here
244 SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here
245 '''
246                         sys.exit(1)
247                 return None
248
249 ## html2text options ##
250
251 # Use Unicode characters instead of their ascii psuedo-replacements
252 UNICODE_SNOB = 0
253
254 # Put the links after each paragraph instead of at the end.
255 LINKS_EACH_PARAGRAPH = 0
256
257 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
258 BODY_WIDTH = 0
259
260 ### Load the Options ###
261
262 # Read options from config file if present.
263 import sys
264 sys.path.insert(0,".")
265 try:
266         from config import *
267 except:
268         pass
269
270 warn = sys.stderr
271         
272 if QP_REQUIRED:
273         print >>warn, "QP_REQUIRED has been deprecated in rss2email."
274
275 ### Import Modules ###
276
277 import cPickle as pickle, time, os, traceback, sys, types, subprocess
278 hash = ()
279 try:
280         import hashlib
281         hash = hashlib.md5
282 except ImportError:
283         import md5
284         hash = md5.new
285
286 unix = 0
287 try:
288         import fcntl
289 # A pox on SunOS file locking methods   
290         if (sys.platform.find('sunos') == -1): 
291                 unix = 1
292 except:
293         pass
294                 
295 import socket; socket_errors = []
296 for e in ['error', 'gaierror']:
297         if hasattr(socket, e): socket_errors.append(getattr(socket, e))
298
299 #DEPRECATED import mimify 
300 #DEPRECATED from StringIO import StringIO as SIO 
301 #DEPRECATED mimify.CHARSET = 'utf-8'
302
303 import feedparser
304 feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.allthingsrss.com/rss2email/"
305
306 import html2text as h2t
307
308 h2t.UNICODE_SNOB = UNICODE_SNOB
309 h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
310 h2t.BODY_WIDTH = BODY_WIDTH
311 html2text = h2t.html2text
312
313 from types import *
314
315 ### Utility Functions ###
316
317 import threading
318 class TimeoutError(Exception): pass
319
320 class InputError(Exception): pass
321
322 def timelimit(timeout, function):
323 #    def internal(function):
324         def internal2(*args, **kw):
325             """
326             from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878
327             """
328             class Calculator(threading.Thread):
329                 def __init__(self):
330                     threading.Thread.__init__(self)
331                     self.result = None
332                     self.error = None
333                 
334                 def run(self):
335                     try:
336                         self.result = function(*args, **kw)
337                     except:
338                         self.error = sys.exc_info()
339             
340             c = Calculator()
341             c.setDaemon(True) # don't hold up exiting
342             c.start()
343             c.join(timeout)
344             if c.isAlive():
345                 raise TimeoutError
346             if c.error:
347                 raise c.error[0], c.error[1]
348             return c.result
349         return internal2
350 #    return internal
351     
352
353 def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
354 def ishtml(t): return type(t) is type(())
355 def contains(a,b): return a.find(b) != -1
356 def unu(s): # I / freakin' hate / that unicode
357         if type(s) is types.UnicodeType: return s.encode('utf-8')
358         else: return s
359
360 ### Parsing Utilities ###
361
362 def getContent(entry, HTMLOK=0):
363         """Select the best content from an entry, deHTMLizing if necessary.
364         If raw HTML is best, an ('HTML', best) tuple is returned. """
365         
366         # How this works:
367         #  * We have a bunch of potential contents. 
368         #  * We go thru looking for our first choice. 
369         #    (HTML or text, depending on HTMLOK)
370         #  * If that doesn't work, we go thru looking for our second choice.
371         #  * If that still doesn't work, we just take the first one.
372         #
373         # Possible future improvement:
374         #  * Instead of just taking the first one
375         #    pick the one in the "best" language.
376         #  * HACK: hardcoded HTMLOK, should take a tuple of media types
377         
378         conts = entry.get('content', [])
379         
380         if entry.get('summary_detail', {}):
381                 conts += [entry.summary_detail]
382         
383         if conts:
384                 if HTMLOK:
385                         for c in conts:
386                                 if contains(c.type, 'html'): return ('HTML', c.value)
387         
388                 if not HTMLOK: # Only need to convert to text if HTML isn't OK
389                         for c in conts:
390                                 if contains(c.type, 'html'):
391                                         return html2text(c.value)
392                 
393                 for c in conts:
394                         if c.type == 'text/plain': return c.value
395         
396                 return conts[0].value   
397         
398         return ""
399
400 def getID(entry):
401         """Get best ID from an entry."""
402         if TRUST_GUID:
403                 if 'id' in entry and entry.id: 
404                         # Newer versions of feedparser could return a dictionary
405                         if type(entry.id) is DictType:
406                                 return entry.id.values()[0]
407
408                         return entry.id
409
410         content = getContent(entry)
411         if content and content != "\n": return hash(unu(content)).hexdigest()
412         if 'link' in entry: return entry.link
413         if 'title' in entry: return hash(unu(entry.title)).hexdigest()
414
415 def getName(r, entry):
416         """Get the best name."""
417
418         if NO_FRIENDLY_NAME: return ''
419
420         feed = r.feed
421         if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
422                 return OVERRIDE_FROM[r.url]
423         
424         name = feed.get('title', '')
425
426         if 'name' in entry.get('author_detail', []): # normally {} but py2.1
427                 if entry.author_detail.name:
428                         if name: name += ": "
429                         det=entry.author_detail.name
430                         try:
431                             name +=  entry.author_detail.name
432                         except UnicodeDecodeError:
433                             name +=  unicode(entry.author_detail.name, 'utf-8')
434
435         elif 'name' in feed.get('author_detail', []):
436                 if feed.author_detail.name:
437                         if name: name += ", "
438                         name += feed.author_detail.name
439         
440         return name
441
442 def validateEmail(email, planb):
443         """Do a basic quality check on email address, but return planb if email doesn't appear to be well-formed"""
444         email_parts = email.split('@')
445         if len(email_parts) != 2:
446                 return planb
447         return email
448         
449 def getEmail(r, entry):
450         """Get the best email_address. If the best guess isn't well-formed (something@somthing.com), use DEFAULT_FROM instead"""
451         
452         feed = r.feed
453                 
454         if FORCE_FROM: return DEFAULT_FROM
455         
456         if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys():
457                 return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM)
458         
459         if 'email' in entry.get('author_detail', []):
460                 return validateEmail(entry.author_detail.email, DEFAULT_FROM)
461         
462         if 'email' in feed.get('author_detail', []):
463                 return validateEmail(feed.author_detail.email, DEFAULT_FROM)
464                 
465         if USE_PUBLISHER_EMAIL:
466                 if 'email' in feed.get('publisher_detail', []):
467                         return validateEmail(feed.publisher_detail.email, DEFAULT_FROM)
468                 
469                 if feed.get("errorreportsto", ''):
470                         return validateEmail(feed.errorreportsto, DEFAULT_FROM)
471                         
472         if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys():
473                 return DEFAULT_EMAIL[r.url]
474         return DEFAULT_FROM
475
476 ### Simple Database of Feeds ###
477
478 class Feed:
479         def __init__(self, url, to):
480                 self.url, self.etag, self.modified, self.seen = url, None, None, {}
481                 self.active = True
482                 self.to = to            
483
484 def load(lock=1):
485         if not os.path.exists(feedfile):
486                 print 'Feedfile "%s" does not exist.  If you\'re using r2e for the first time, you' % feedfile
487                 print "have to run 'r2e new' first."
488                 sys.exit(1)
489         try:
490                 feedfileObject = open(feedfile, 'r')
491         except IOError, e:
492                 print "Feedfile could not be opened: %s" % e
493                 sys.exit(1)
494         feeds = pickle.load(feedfileObject)
495         
496         if lock:
497                 locktype = 0
498                 if unix:
499                         locktype = fcntl.LOCK_EX
500                         fcntl.flock(feedfileObject.fileno(), locktype)
501                 #HACK: to deal with lock caching
502                 feedfileObject = open(feedfile, 'r')
503                 feeds = pickle.load(feedfileObject)
504                 if unix: 
505                         fcntl.flock(feedfileObject.fileno(), locktype)
506         if feeds: 
507                 for feed in feeds[1:]:
508                         if not hasattr(feed, 'active'): 
509                                 feed.active = True
510                 
511         return feeds, feedfileObject
512
513 def unlock(feeds, feedfileObject):
514         if not unix: 
515                 pickle.dump(feeds, open(feedfile, 'w'))
516         else:   
517                 fd = open(feedfile+'.tmp', 'w')
518                 pickle.dump(feeds, fd)
519                 fd.flush()
520                 os.fsync(fd.fileno())
521                 fd.close()
522                 os.rename(feedfile+'.tmp', feedfile)
523                 fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
524
525 #@timelimit(FEED_TIMEOUT)               
526 def parse(url, etag, modified):
527         if PROXY == '':
528                 return feedparser.parse(url, etag, modified)
529         else:
530                 proxy = urllib2.ProxyHandler( {"http":PROXY} )
531                 return feedparser.parse(url, etag, modified, handlers = [proxy])        
532         
533                 
534 ### Program Functions ###
535
536 def add(*args):
537         if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
538                 urls, to = [args[0]], args[1]
539         else:
540                 urls, to = args, None
541         
542         feeds, feedfileObject = load()
543         if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None):
544                 print "No email address has been defined. Please run 'r2e email emailaddress' or"
545                 print "'r2e add url emailaddress'."
546                 sys.exit(1)
547         for url in urls: feeds.append(Feed(url, to))
548         unlock(feeds, feedfileObject)
549
550 def run(num=None):
551         feeds, feedfileObject = load()
552         smtpserver = None
553         try:
554                 # We store the default to address as the first item in the feeds list.
555                 # Here we take it out and save it for later.
556                 default_to = ""
557                 if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
558                 else: ifeeds = feeds
559                 
560                 if num: ifeeds = [feeds[num]]
561                 feednum = 0
562                 
563                 for f in ifeeds:
564                         try: 
565                                 feednum += 1
566                                 if not f.active: continue
567                                 
568                                 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
569                                 r = {}
570                                 try:
571                                         r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
572                                 except TimeoutError:
573                                         print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
574                                         continue
575                                 
576                                 # Handle various status conditions, as required
577                                 if 'status' in r:
578                                         if r.status == 301: f.url = r['url']
579                                         elif r.status == 410:
580                                                 print >>warn, "W: feed gone; deleting", f.url
581                                                 feeds.remove(f)
582                                                 continue
583                                 
584                                 http_status = r.get('status', 200)
585                                 if VERBOSE > 1: print >>warn, "I: http status", http_status
586                                 http_headers = r.get('headers', {
587                                   'content-type': 'application/rss+xml', 
588                                   'content-length':'1'})
589                                 exc_type = r.get("bozo_exception", Exception()).__class__
590                                 if http_status != 304 and not r.entries and not r.get('version', ''):
591                                         if http_status not in [200, 302]: 
592                                                 print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
593
594                                         elif contains(http_headers.get('content-type', 'rss'), 'html'):
595                                                 print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)
596
597                                         elif http_headers.get('content-length', '1') == '0':
598                                                 print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
599
600                                         elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
601                                                 print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
602                                         
603                                         elif exc_type == IOError:
604                                                 print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
605                                         
606                                         elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
607                                                 print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
608                                         
609                                         elif exc_type in socket_errors:
610                                                 exc_reason = r.bozo_exception.args[1]
611                                                 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
612
613                                         elif exc_type == urllib2.URLError:
614                                                 if r.bozo_exception.reason.__class__ in socket_errors:
615                                                         exc_reason = r.bozo_exception.reason.args[1]
616                                                 else:
617                                                         exc_reason = r.bozo_exception.reason
618                                                 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
619                                         
620                                         elif exc_type == AttributeError:
621                                                 print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
622                                         
623                                         elif exc_type == KeyboardInterrupt:
624                                                 raise r.bozo_exception
625                                                 
626                                         elif r.bozo:
627                                                 print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
628
629                                         else:
630                                                 print >>warn, "=== rss2email encountered a problem with this feed ==="
631                                                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
632                                                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
633                                                 print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
634                                                 print >>warn, r
635                                                 print >>warn, "rss2email", __version__
636                                                 print >>warn, "feedparser", feedparser.__version__
637                                                 print >>warn, "html2text", h2t.__version__
638                                                 print >>warn, "Python", sys.version
639                                                 print >>warn, "=== END HERE ==="
640                                         continue
641                                 
642                                 r.entries.reverse()
643                                 
644                                 for entry in r.entries:
645                                         id = getID(entry)
646                                         
647                                         # If TRUST_GUID isn't set, we get back hashes of the content.
648                                         # Instead of letting these run wild, we put them in context
649                                         # by associating them with the actual ID (if it exists).
650                                         
651                                         frameid = entry.get('id')
652                                         if not(frameid): frameid = id
653                                         if type(frameid) is DictType:
654                                                 frameid = frameid.values()[0]
655                                         
656                                         # If this item's ID is in our database
657                                         # then it's already been sent
658                                         # and we don't need to do anything more.
659                                         
660                                         if frameid in f.seen:
661                                                 if f.seen[frameid] == id: continue
662
663                                         if not (f.to or default_to):
664                                                 print "No default email address defined. Please run 'r2e email emailaddress'"
665                                                 print "Ignoring feed %s" % f.url
666                                                 break
667                                         
668                                         if 'title_detail' in entry and entry.title_detail:
669                                                 title = entry.title_detail.value
670                                                 if contains(entry.title_detail.type, 'html'):
671                                                         title = html2text(title)
672                                         else:
673                                                 title = getContent(entry)[:70]
674
675                                         title = title.replace("\n", " ").strip()
676                                         
677                                         datetime = time.gmtime()
678
679                                         if DATE_HEADER:
680                                                 for datetype in DATE_HEADER_ORDER:
681                                                         kind = datetype+"_parsed"
682                                                         if kind in entry and entry[kind]: datetime = entry[kind]
683                                                 
684                                         link = entry.get('link', "")
685                                         
686                                         from_addr = getEmail(r, entry)
687                                         
688                                         name = h2t.unescape(getName(r, entry))
689                                         fromhdr = formataddr((name, from_addr,))
690                                         tohdr = (f.to or default_to)
691                                         subjecthdr = title
692                                         datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
693                                         useragenthdr = "rss2email"
694                                         
695                                         # Add post tags, if available
696                                         tagline = ""
697                                         if 'tags' in entry:
698                                                 tags = entry.get('tags')
699                                                 taglist = []
700                                                 if tags:
701                                                         for tag in tags:
702                                                                 taglist.append(tag['term'])
703                                                 if taglist:
704                                                         tagline = ",".join(taglist)
705                                         
706                                         extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
707                                         if BONUS_HEADER != '':
708                                                 for hdr in BONUS_HEADER.strip().splitlines():
709                                                         pos = hdr.strip().find(':')
710                                                         if pos > 0:
711                                                                 extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
712                                                         else:
713                                                                 print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER 
714                                         
715                                         entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
716                                         contenttype = 'plain'
717                                         content = ''
718                                         if USE_CSS_STYLING and HTML_MAIL:
719                                                 contenttype = 'html'
720                                                 content = "<html>\n" 
721                                                 content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
722                                                 content += '<body>\n'
723                                                 content += '<div id="entry">\n'
724                                                 content += '<h1'
725                                                 content += ' class="header"'
726                                                 content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
727                                                 if ishtml(entrycontent):
728                                                         body = entrycontent[1].strip()
729                                                 else:
730                                                         body = entrycontent.strip()
731                                                 if body != '':  
732                                                         content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
733                                                 content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
734                                                 if hasattr(entry,'enclosures'):
735                                                         for enclosure in entry.enclosures:
736                                                                 if (hasattr(enclosure, 'url') and enclosure.url != ""):
737                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
738                                                                 if (hasattr(enclosure, 'src') and enclosure.src != ""):
739                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
740                                                 if 'links' in entry:
741                                                         for extralink in entry.links:
742                                                                 if ('rel' in extralink) and extralink['rel'] == u'via':
743                                                                         extraurl = extralink['href']
744                                                                         extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
745                                                                         viatitle = extraurl
746                                                                         if ('title' in extralink):
747                                                                             viatitle = extralink['title']
748                                                                         content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
749                                                 content += '</p></div>\n'
750                                                 content += "\n\n</body></html>"
751                                         else:   
752                                                 if ishtml(entrycontent):
753                                                         contenttype = 'html'
754                                                         content = "<html>\n" 
755                                                         content = ("<html><body>\n\n" + 
756                                                                    '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
757                                                                    entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
758                                                                    '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
759                                                                    
760                                                         if hasattr(entry,'enclosures'):
761                                                                 for enclosure in entry.enclosures:
762                                                                         if enclosure.url != "":
763                                                                                 content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
764                                                         if 'links' in entry:
765                                                                 for extralink in entry.links:
766                                                                         if ('rel' in extralink) and extralink['rel'] == u'via':
767                                                                                 content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
768                                                                 
769                                                         content += ("\n</body></html>")
770                                                 else:
771                                                         content = entrycontent.strip() + "\n\nURL: "+link
772                                                         if hasattr(entry,'enclosures'):
773                                                                 for enclosure in entry.enclosures:
774                                                                         if enclosure.url != "":
775                                                                                 content += ('\nEnclosure: ' + enclosure.url + "\n")
776                                                         if 'links' in entry:
777                                                                 for extralink in entry.links:
778                                                                         if ('rel' in extralink) and extralink['rel'] == u'via':
779                                                                                 content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
780
781                                         smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
782                         
783                                         f.seen[frameid] = id
784                                         
785                                 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
786                         except (KeyboardInterrupt, SystemExit):
787                                 raise
788                         except:
789                                 print >>warn, "=== rss2email encountered a problem with this feed ==="
790                                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
791                                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
792                                 print >>warn, "E: could not parse", f.url
793                                 traceback.print_exc(file=warn)
794                                 print >>warn, "rss2email", __version__
795                                 print >>warn, "feedparser", feedparser.__version__
796                                 print >>warn, "html2text", h2t.__version__
797                                 print >>warn, "Python", sys.version
798                                 print >>warn, "=== END HERE ==="
799                                 continue
800
801         finally:                
802                 unlock(feeds, feedfileObject)
803                 if smtpserver:
804                         smtpserver.quit()
805
806 def list():
807         feeds, feedfileObject = load(lock=0)
808         default_to = ""
809         
810         if feeds and isstr(feeds[0]):
811                 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
812                 print "default email:", default_to
813         else: ifeeds = feeds; i = 0
814         for f in ifeeds:
815                 active = ('[ ]', '[*]')[f.active]
816                 print `i`+':',active, f.url, '('+(f.to or ('default: '+default_to))+')'
817                 if not (f.to or default_to):
818                         print "   W: Please define a default address with 'r2e email emailaddress'"
819                 i+= 1
820
821 def opmlexport():
822         import xml.sax.saxutils
823         feeds, feedfileObject = load(lock=0)
824         
825         if feeds:
826                 print '<?xml version="1.0" encoding="UTF-8"?>\n<opml version="1.0">\n<head>\n<title>rss2email OPML export</title>\n</head>\n<body>'
827                 for f in feeds[1:]:
828                         url = xml.sax.saxutils.escape(f.url)
829                         print '<outline type="rss" text="%s" xmlUrl="%s"/>' % (url, url)
830                 print '</body>\n</opml>'
831
832 def opmlimport(importfile):
833         importfileObject = None
834         print 'Importing feeds from', importfile
835         if not os.path.exists(importfile):
836                 print 'OPML import file "%s" does not exist.' % feedfile
837         try:
838                 importfileObject = open(importfile, 'r')
839         except IOError, e:
840                 print "OPML import file could not be opened: %s" % e
841                 sys.exit(1)
842         try:
843                 import xml.dom.minidom
844                 dom = xml.dom.minidom.parse(importfileObject)
845                 newfeeds = dom.getElementsByTagName('outline')
846         except:
847                 print 'E: Unable to parse OPML file'
848                 sys.exit(1)
849
850         feeds, feedfileObject = load(lock=1)
851         
852         import xml.sax.saxutils
853         
854         for f in newfeeds:
855                 if f.hasAttribute('xmlUrl'):
856                         feedurl = f.getAttribute('xmlUrl')
857                         print 'Adding %s' % xml.sax.saxutils.unescape(feedurl)
858                         feeds.append(Feed(feedurl, None))
859                         
860         unlock(feeds, feedfileObject)
861
862 def delete(n):
863         feeds, feedfileObject = load()
864         if (n == 0) and (feeds and isstr(feeds[0])):
865                 print >>warn, "W: ID has to be equal to or higher than 1"
866         elif n >= len(feeds):
867                 print >>warn, "W: no such feed"
868         else:
869                 print >>warn, "W: deleting feed %s" % feeds[n].url
870                 feeds = feeds[:n] + feeds[n+1:]
871                 if n != len(feeds):
872                         print >>warn, "W: feed IDs have changed, list before deleting again"
873         unlock(feeds, feedfileObject)
874         
875 def toggleactive(n, active):
876         feeds, feedfileObject = load()
877         if (n == 0) and (feeds and isstr(feeds[0])):
878                 print >>warn, "W: ID has to be equal to or higher than 1"
879         elif n >= len(feeds):
880                 print >>warn, "W: no such feed"
881         else:
882                 action = ('Pausing', 'Unpausing')[active]
883                 print >>warn, "%s feed %s" % (action, feeds[n].url)
884                 feeds[n].active = active
885         unlock(feeds, feedfileObject)
886         
887 def reset():
888         feeds, feedfileObject = load()
889         if feeds and isstr(feeds[0]):
890                 ifeeds = feeds[1:]
891         else: ifeeds = feeds
892         for f in ifeeds:
893                 if VERBOSE: print "Resetting %d already seen items" % len(f.seen)
894                 f.seen = {}
895                 f.etag = None
896                 f.modified = None
897         
898         unlock(feeds, feedfileObject)
899         
900 def email(addr):
901         feeds, feedfileObject = load()
902         if feeds and isstr(feeds[0]): feeds[0] = addr
903         else: feeds = [addr] + feeds
904         unlock(feeds, feedfileObject)
905
906 if __name__ == '__main__':
907         args = sys.argv
908         try:
909                 if len(args) < 3: raise InputError, "insufficient args"
910                 feedfile, action, args = args[1], args[2], args[3:]
911                 
912                 if action == "run": 
913                         if args and args[0] == "--no-send":
914                                 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
915                                         if VERBOSE: print 'Not sending:', unu(subject)
916
917                         if args and args[-1].isdigit(): run(int(args[-1]))
918                         else: run()
919
920                 elif action == "email":
921                         if not args:
922                                 raise InputError, "Action '%s' requires an argument" % action
923                         else:
924                                 email(args[0])
925
926                 elif action == "add": add(*args)
927
928                 elif action == "new": 
929                         if len(args) == 1: d = [args[0]]
930                         else: d = []
931                         pickle.dump(d, open(feedfile, 'w'))
932
933                 elif action == "list": list()
934
935                 elif action in ("help", "--help", "-h"): print __doc__
936
937                 elif action == "delete":
938                         if not args:
939                                 raise InputError, "Action '%s' requires an argument" % action
940                         elif args[0].isdigit():
941                                 delete(int(args[0]))
942                         else:
943                                 raise InputError, "Action '%s' requires a number as its argument" % action
944
945                 elif action in ("pause", "unpause"):
946                         if not args:
947                                 raise InputError, "Action '%s' requires an argument" % action
948                         elif args[0].isdigit():
949                                 active = (action == "unpause")
950                                 toggleactive(int(args[0]), active)
951                         else:
952                                 raise InputError, "Action '%s' requires a number as its argument" % action
953
954                 elif action == "reset": reset()
955
956                 elif action == "opmlexport": opmlexport()
957
958                 elif action == "opmlimport": 
959                         if not args:
960                                 raise InputError, "OPML import '%s' requires a filename argument" % action
961                         opmlimport(args[0])
962
963                 else:
964                         raise InputError, "Invalid action"
965                 
966         except InputError, e:
967                 print "E:", e
968                 print
969                 print __doc__
970