rss2email.py

   1 #!/usr/bin/python
   2 """rss2email: get RSS feeds emailed to you
   3 http://rss2email.infogami.com
   4
   5 Usage:
   6   new [emailaddress] (create new feedfile)
   7   email newemailaddress (update default email)
   8   run [--no-send] [num]
   9   add feedurl [emailaddress]
  10   list
  11   reset
  12   delete n
  13   pause n
  14   unpause n
  15   opmlexport
  16   opmlimport filename
  17 """
  18 __version__ = "2.71"
  19 __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
  20 __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
  21 ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
  22                      "Matej Cepl", "Martin 'Joey' Schulze",
  23                      "Marcel Ackermann (http://www.DreamFlasher.de)",
  24                      "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
  25
  26 import urllib2
  27 urllib2.install_opener(urllib2.build_opener())
  28
  29 ### Vaguely Customizable Options ###
  30
  31 # The email address messages are from by default:
  32 DEFAULT_FROM = "user@rss2email.invalid"
  33
  34 # 1: Send text/html messages when possible.
  35 # 0: Convert HTML to plain text.
  36 HTML_MAIL = 0
  37
  38 # 1: Only use the DEFAULT_FROM address.
  39 # 0: Use the email address specified by the feed, when possible.
  40 FORCE_FROM = 0
  41
  42 # 1: Receive one email per post.
  43 # 0: Receive an email every time a post changes.
  44 TRUST_GUID = 1
  45
  46 # 1: Generate Date header based on item's date, when possible.
  47 # 0: Generate Date header based on time sent.
  48 DATE_HEADER = 0
  49
  50 # A tuple consisting of some combination of
  51 # ('issued', 'created', 'modified', 'expired')
  52 # expressing ordered list of preference in dates
  53 # to use for the Date header of the email.
  54 DATE_HEADER_ORDER = ('modified', 'issued', 'created')
  55
  56 # 1: Apply Q-P conversion (required for some MUAs).
  57 # 0: Send message in 8-bits.
  58 # http://cr.yp.to/smtp/8bitmime.html
  59 #DEPRECATED
  60 QP_REQUIRED = 0
  61 #DEPRECATED
  62
  63 # 1: Name feeds as they're being processed.
  64 # 0: Keep quiet.
  65 VERBOSE = 0
  66
  67 # 1: Use the publisher's email if you can't find the author's.
  68 # 0: Just use the DEFAULT_FROM email instead.
  69 USE_PUBLISHER_EMAIL = 0
  70
  71 # 1: Use SMTP_SERVER to send mail.
  72 # 0: Call /usr/sbin/sendmail to send mail.
  73 SMTP_SEND = 0
  74
  75 SMTP_SERVER = "smtp.yourisp.net:25"
  76 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
  77 SMTP_USER = 'username'  # for SMTP AUTH, set SMTP username here
  78 SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here
  79
  80 # Connect to the SMTP server using SSL
  81 SMTP_SSL = 0
  82
  83 # Set this to add a bonus header to all emails (start with '\n').
  84 BONUS_HEADER = ''
  85 # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
  86
  87 # Set this to override the From display name. Keys are feed URLs, values are the names to use instead.
  88 OVERRIDE_FROM = {}
  89
  90 # Set this to override From email addresses. Keys are feed URLs, values are new emails.
  91 OVERRIDE_EMAIL = {}
  92
  93 # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
  94 DEFAULT_EMAIL = {}
  95
  96 # Only use the email from address rather than friendly name plus email address
  97 NO_FRIENDLY_NAME = 0
  98
  99 # Set this to override the timeout (in seconds) for feed server response
 100 FEED_TIMEOUT = 60
 101
 102 # Optional CSS styling
 103 USE_CSS_STYLING = 0
 104 STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; }  .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
 105
 106 # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
 107 PROXY=""
 108
 109 # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
 110 # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
 111 CHARSET_LIST='US-ASCII', 'ISO-8859-1', 'UTF-8', 'BIG5', 'ISO-2022-JP'
 112
 113 from email.MIMEText import MIMEText
 114 from email.Header import Header as _Header
 115 from email.Utils import parseaddr, formataddr
 116
 117 class Header(_Header):
 118     # Work-around for <http://bugs.python.org/issue5871>
 119     def append(self, s=None, *args, **kwargs):
 120         if s is not None:
 121             s = s.replace('\n', ' ').replace('\r', ' ')
 122         _Header.append(self, s, *args, **kwargs)
 123
 124 # Note: You can also override the send function.
 125
 126 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
 127         """Send an email.
 128
 129         All arguments should be Unicode strings (plain ASCII works as well).
 130
 131         Only the real name part of sender and recipient addresses may contain
 132         non-ASCII characters.
 133
 134         The email will be properly MIME encoded and delivered though SMTP to
 135         localhost port 25.  This is easy to change if you want something different.
 136
 137         The charset of the email will be the first one out of the list
 138         that can represent all the characters occurring in the email.
 139         """
 140
 141         # Header class is smart enough to try US-ASCII, then the charset we
 142         # provide, then fall back to UTF-8.
 143         header_charset = 'ISO-8859-1'
 144
 145         # We must choose the body charset manually
 146         for body_charset in CHARSET_LIST:
 147             try:
 148                 body.encode(body_charset)
 149             except (UnicodeError, LookupError):
 150                 pass
 151             else:
 152                 break
 153
 154         # Split real name (which is optional) and email address parts
 155         sender_name, sender_addr = parseaddr(sender)
 156         recipient_name, recipient_addr = parseaddr(recipient)
 157
 158         # We must always pass Unicode strings to Header, otherwise it will
 159         # use RFC 2047 encoding even on plain ASCII strings.
 160         sender_name = str(Header(unicode(sender_name), header_charset))
 161         recipient_name = str(Header(unicode(recipient_name), header_charset))
 162
 163         # Make sure email addresses do not contain non-ASCII characters
 164         sender_addr = sender_addr.encode('ascii')
 165         recipient_addr = recipient_addr.encode('ascii')
 166
 167         # Create the message ('plain' stands for Content-Type: text/plain)
 168         msg = MIMEText(body.encode(body_charset), contenttype, body_charset)
 169         msg['To'] = formataddr((recipient_name, recipient_addr))
 170         msg['Subject'] = Header(unicode(subject), header_charset)
 171         for hdr in extraheaders.keys():
 172                 try:
 173                         msg[hdr] = Header(unicode(extraheaders[hdr], header_charset))
 174                 except:
 175                         msg[hdr] = Header(extraheaders[hdr])
 176
 177         fromhdr = formataddr((sender_name, sender_addr))
 178         msg['From'] = fromhdr
 179
 180         msg_as_string = msg.as_string()
 181 #DEPRECATED     if QP_REQUIRED:
 182 #DEPRECATED             ins, outs = SIO(msg_as_string), SIO()
 183 #DEPRECATED             mimify.mimify(ins, outs)
 184 #DEPRECATED             msg_as_string = outs.getvalue()
 185
 186         if SMTP_SEND:
 187                 if not smtpserver:
 188                         import smtplib
 189
 190                         try:
 191                                 if SMTP_SSL:
 192                                         smtpserver = smtplib.SMTP_SSL()
 193                                 else:
 194                                         smtpserver = smtplib.SMTP()
 195                                 smtpserver.connect(SMTP_SERVER)
 196                         except KeyboardInterrupt:
 197                                 raise
 198                         except Exception, e:
 199                                 print >>warn, ""
 200                                 print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER)
 201                                 print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
 202                                 if hasattr(e, 'reason'):
 203                                         print >>warn, "Reason:", e.reason
 204                                 sys.exit(1)
 205
 206                         if AUTHREQUIRED:
 207                                 try:
 208                                         smtpserver.ehlo()
 209                                         if not SMTP_SSL: smtpserver.starttls()
 210                                         smtpserver.ehlo()
 211                                         smtpserver.login(SMTP_USER, SMTP_PASS)
 212                                 except KeyboardInterrupt:
 213                                         raise
 214                                 except Exception, e:
 215                                         print >>warn, ""
 216                                         print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER))
 217                                         print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
 218                                         if hasattr(e, 'reason'):
 219                                                 print >>warn, "Reason:", e.reason
 220                                         sys.exit(1)
 221
 222                 smtpserver.sendmail(sender, recipient, msg_as_string)
 223                 return smtpserver
 224
 225         else:
 226                 try:
 227                         p = subprocess.Popen(["/usr/sbin/sendmail", recipient], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 228                         p.communicate(msg_as_string)
 229                         status = p.returncode
 230                         assert status != None, "just a sanity check"
 231                         if status != 0:
 232                                 print >>warn, ""
 233                                 print >>warn, ('Fatal error: sendmail exited with code %s' % status)
 234                                 sys.exit(1)
 235                 except:
 236                         print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email:
 237 # 1: Use SMTP_SERVER to send mail.
 238 # 0: Call /usr/sbin/sendmail to send mail.
 239 SMTP_SEND = 0
 240
 241 SMTP_SERVER = "smtp.yourisp.net:25"
 242 AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
 243 SMTP_USER = 'username'  # for SMTP AUTH, set SMTP username here
 244 SMTP_PASS = 'password'  # for SMTP AUTH, set SMTP password here
 245 '''
 246                         sys.exit(1)
 247                 return None
 248
 249 ## html2text options ##
 250
 251 # Use Unicode characters instead of their ASCII pseudo-replacements
 252 UNICODE_SNOB = 0
 253
 254 # Put the links after each paragraph instead of at the end.
 255 LINKS_EACH_PARAGRAPH = 0
 256
 257 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
 258 BODY_WIDTH = 0
 259
 260 ### Load the Options ###
 261
 262 # Read options from config file if present.
 263 import sys
 264 sys.path.insert(0,".")
 265 try:
 266         from config import *
 267 except:
 268         pass
 269
 270 warn = sys.stderr
 271
 272 if QP_REQUIRED:
 273         print >>warn, "QP_REQUIRED has been deprecated in rss2email."
 274
 275 ### Import Modules ###
 276
 277 import cPickle as pickle, time, os, traceback, sys, types, subprocess
# `hash` is the md5 constructor used to fingerprint entry content.
# The empty tuple is only a placeholder; it is replaced immediately below.
hash = ()
try:
        # Preferred source of md5.
        import hashlib
        hash = hashlib.md5
except ImportError:
        # Fall back to the stand-alone md5 module when hashlib is missing.
        import md5
        hash = md5.new

# `unix` is 1 only when fcntl can be imported and the platform name does not
# contain 'sunos'; load() and unlock() consult it before calling fcntl.flock.
unix = 0
try:
        import fcntl
# A pox on SunOS file locking methods
        if (sys.platform.find('sunos') == -1):
                unix = 1
except:
        # Any failure here simply leaves file locking switched off.
        pass

# Collect whichever of socket.error / socket.gaierror this Python provides.
import socket; socket_errors = []
for e in ['error', 'gaierror']:
        if hasattr(socket, e): socket_errors.append(getattr(socket, e))
 298
 299 #DEPRECATED import mimify
 300 #DEPRECATED from StringIO import StringIO as SIO
 301 #DEPRECATED mimify.CHARSET = 'utf-8'
 302
 303 import feedparser
 304 feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.allthingsrss.com/rss2email/"
 305
 306 import html2text as h2t
 307
 308 h2t.UNICODE_SNOB = UNICODE_SNOB
 309 h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
 310 h2t.BODY_WIDTH = BODY_WIDTH
 311 html2text = h2t.html2text
 312
 313 from types import *
 314
 315 ### Utility Functions ###
 316
 317 import threading
 318 class TimeoutError(Exception): pass
 319
# NOTE(review): not raised anywhere in this part of the file; presumably
# signals invalid user input -- confirm against the callers.
class InputError(Exception): pass
 321
 322 def timelimit(timeout, function):
 323 #    def internal(function):
 324         def internal2(*args, **kw):
 325             """
 326             from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878
 327             """
 328             class Calculator(threading.Thread):
 329                 def __init__(self):
 330                     threading.Thread.__init__(self)
 331                     self.result = None
 332                     self.error = None
 333
 334                 def run(self):
 335                     try:
 336                         self.result = function(*args, **kw)
 337                     except:
 338                         self.error = sys.exc_info()
 339
 340             c = Calculator()
 341             c.setDaemon(True) # don't hold up exiting
 342             c.start()
 343             c.join(timeout)
 344             if c.isAlive():
 345                 raise TimeoutError
 346             if c.error:
 347                 raise c.error[0], c.error[1]
 348             return c.result
 349         return internal2
 350 #    return internal
 351
 352
 353 def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
 354 def ishtml(t): return type(t) is type(())
 355 def contains(a,b): return a.find(b) != -1
 356 def unu(s): # I / freakin' hate / that unicode
 357         if type(s) is types.UnicodeType: return s.encode('utf-8')
 358         else: return s
 359
 360 ### Parsing Utilities ###
 361
 362 def getContent(entry, HTMLOK=0):
 363         """Select the best content from an entry, deHTMLizing if necessary.
 364         If raw HTML is best, an ('HTML', best) tuple is returned. """
 365
 366         # How this works:
 367         #  * We have a bunch of potential contents.
 368         #  * We go thru looking for our first choice.
 369         #    (HTML or text, depending on HTMLOK)
 370         #  * If that doesn't work, we go thru looking for our second choice.
 371         #  * If that still doesn't work, we just take the first one.
 372         #
 373         # Possible future improvement:
 374         #  * Instead of just taking the first one
 375         #    pick the one in the "best" language.
 376         #  * HACK: hardcoded HTMLOK, should take a tuple of media types
 377
 378         conts = entry.get('content', [])
 379
 380         if entry.get('summary_detail', {}):
 381                 conts += [entry.summary_detail]
 382
 383         if conts:
 384                 if HTMLOK:
 385                         for c in conts:
 386                                 if contains(c.type, 'html'): return ('HTML', c.value)
 387
 388                 if not HTMLOK: # Only need to convert to text if HTML isn't OK
 389                         for c in conts:
 390                                 if contains(c.type, 'html'):
 391                                         return html2text(c.value)
 392
 393                 for c in conts:
 394                         if c.type == 'text/plain': return c.value
 395
 396                 return conts[0].value
 397
 398         return ""
 399
 400 def getID(entry):
 401         """Get best ID from an entry."""
 402         if TRUST_GUID:
 403                 if 'id' in entry and entry.id:
 404                         # Newer versions of feedparser could return a dictionary
 405                         if type(entry.id) is DictType:
 406                                 return entry.id.values()[0]
 407
 408                         return entry.id
 409
 410         content = getContent(entry)
 411         if content and content != "\n": return hash(unu(content)).hexdigest()
 412         if 'link' in entry: return entry.link
 413         if 'title' in entry: return hash(unu(entry.title)).hexdigest()
 414
 415 def getName(r, entry):
 416         """Get the best name."""
 417
 418         if NO_FRIENDLY_NAME: return ''
 419
 420         feed = r.feed
 421         if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
 422                 return OVERRIDE_FROM[r.url]
 423
 424         name = feed.get('title', '')
 425
 426         if 'name' in entry.get('author_detail', []): # normally {} but py2.1
 427                 if entry.author_detail.name:
 428                         if name: name += ": "
 429                         det=entry.author_detail.name
 430                         try:
 431                             name +=  entry.author_detail.name
 432                         except UnicodeDecodeError:
 433                             name +=  unicode(entry.author_detail.name, 'utf-8')
 434
 435         elif 'name' in feed.get('author_detail', []):
 436                 if feed.author_detail.name:
 437                         if name: name += ", "
 438                         name += feed.author_detail.name
 439
 440         return name
 441
 442 def validateEmail(email, planb):
 443         """Do a basic quality check on email address, but return planb if email doesn't appear to be well-formed"""
 444         email_parts = email.split('@')
 445         if len(email_parts) != 2:
 446                 return planb
 447         return email
 448
 449 def getEmail(r, entry):
 450         """Get the best email_address. If the best guess isn't well-formed (something@somthing.com), use DEFAULT_FROM instead"""
 451
 452         feed = r.feed
 453
 454         if FORCE_FROM: return DEFAULT_FROM
 455
 456         if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys():
 457                 return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM)
 458
 459         if 'email' in entry.get('author_detail', []):
 460                 return validateEmail(entry.author_detail.email, DEFAULT_FROM)
 461
 462         if 'email' in feed.get('author_detail', []):
 463                 return validateEmail(feed.author_detail.email, DEFAULT_FROM)
 464
 465         if USE_PUBLISHER_EMAIL:
 466                 if 'email' in feed.get('publisher_detail', []):
 467                         return validateEmail(feed.publisher_detail.email, DEFAULT_FROM)
 468
 469                 if feed.get("errorreportsto", ''):
 470                         return validateEmail(feed.errorreportsto, DEFAULT_FROM)
 471
 472         if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys():
 473                 return DEFAULT_EMAIL[r.url]
 474         return DEFAULT_FROM
 475
 476 ### Simple Database of Feeds ###
 477
 478 class Feed:
 479         def __init__(self, url, to):
 480                 self.url, self.etag, self.modified, self.seen = url, None, None, {}
 481                 self.active = True
 482                 self.to = to
 483
def load(lock=1):
        """Read the pickled feed list from the feed file.

        lock -- when true (the default) and fcntl locking is available
                (`unix`), an exclusive flock is taken on the file.

        Returns a (feeds, feedfileObject) pair; the file object is the one
        unlock() later releases the lock on.

        Exits the process with status 1 if the feed file does not exist or
        cannot be opened.
        """
        # `feedfile` is a module-level name that is not defined in this part
        # of the file.
        if not os.path.exists(feedfile):
                print 'Feedfile "%s" does not exist.  If you\'re using r2e for the first time, you' % feedfile
                print "have to run 'r2e new' first."
                sys.exit(1)
        try:
                feedfileObject = open(feedfile, 'r')
        except IOError, e:
                print "Feedfile could not be opened: %s" % e
                sys.exit(1)
        # NOTE: unpickling runs whatever the file describes, so the feed
        # file has to be trusted.
        feeds = pickle.load(feedfileObject)

        if lock:
                locktype = 0
                if unix:
                        locktype = fcntl.LOCK_EX
                        fcntl.flock(feedfileObject.fileno(), locktype)
                #HACK: to deal with lock caching
                # Re-open and re-read the file after asking for the lock,
                # then lock the new handle as well; the first handle is
                # dropped without an explicit close.
                feedfileObject = open(feedfile, 'r')
                feeds = pickle.load(feedfileObject)
                if unix:
                        fcntl.flock(feedfileObject.fileno(), locktype)
        if feeds:
                # Index 0 is skipped here; every later item that lacks an
                # 'active' attribute is given active = True.
                for feed in feeds[1:]:
                        if not hasattr(feed, 'active'):
                                feed.active = True

        return feeds, feedfileObject
 512
 513 def unlock(feeds, feedfileObject):
 514         if not unix:
 515                 pickle.dump(feeds, open(feedfile, 'w'))
 516         else:
 517                 fd = open(feedfile+'.tmp', 'w')
 518                 pickle.dump(feeds, fd)
 519                 fd.flush()
 520                 os.fsync(fd.fileno())
 521                 fd.close()
 522                 os.rename(feedfile+'.tmp', feedfile)
 523                 fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
 524
 525 #@timelimit(FEED_TIMEOUT)
 526 def parse(url, etag, modified):
 527         if PROXY == '':
 528                 return feedparser.parse(url, etag, modified)
 529         else:
 530                 proxy = urllib2.ProxyHandler( {"http":PROXY} )
 531                 return feedparser.parse(url, etag, modified, handlers = [proxy])
 532
 533
 534 ### Program Functions ###
 535
 536 def add(*args):
 537         if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
 538                 urls, to = [args[0]], args[1]
 539         else:
 540                 urls, to = args, None
 541
 542         feeds, feedfileObject = load()
 543         if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None):
 544                 print "No email address has been defined. Please run 'r2e email emailaddress' or"
 545                 print "'r2e add url emailaddress'."
 546                 sys.exit(1)
 547         for url in urls: feeds.append(Feed(url, to))
 548         unlock(feeds, feedfileObject)
 549
 550 def run(num=None):
 551         feeds, feedfileObject = load()
 552         smtpserver = None
 553         try:
 554                 # We store the default to address as the first item in the feeds list.
 555                 # Here we take it out and save it for later.
 556                 default_to = ""
 557                 if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
 558                 else: ifeeds = feeds
 559
 560                 if num: ifeeds = [feeds[num]]
 561                 feednum = 0
 562
 563                 for f in ifeeds:
 564                         try:
 565                                 feednum += 1
 566                                 if not f.active: continue
 567
 568                                 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
 569                                 r = {}
 570                                 try:
 571                                         r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
 572                                 except TimeoutError:
 573                                         print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
 574                                         continue
 575
 576                                 # Handle various status conditions, as required
 577                                 if 'status' in r:
 578                                         if r.status == 301: f.url = r['url']
 579                                         elif r.status == 410:
 580                                                 print >>warn, "W: feed gone; deleting", f.url
 581                                                 feeds.remove(f)
 582                                                 continue
 583
 584                                 http_status = r.get('status', 200)
 585                                 if VERBOSE > 1: print >>warn, "I: http status", http_status
 586                                 http_headers = r.get('headers', {
 587                                   'content-type': 'application/rss+xml',
 588                                   'content-length':'1'})
 589                                 exc_type = r.get("bozo_exception", Exception()).__class__
 590                                 if http_status != 304 and not r.entries and not r.get('version', ''):
 591                                         if http_status not in [200, 302]:
 592                                                 print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
 593
 594                                         elif contains(http_headers.get('content-type', 'rss'), 'html'):
 595                                                 print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)
 596
 597                                         elif http_headers.get('content-length', '1') == '0':
 598                                                 print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
 599
 600                                         elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
 601                                                 print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
 602
 603                                         elif exc_type == IOError:
 604                                                 print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
 605
 606                                         elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
 607                                                 print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
 608
 609                                         elif exc_type in socket_errors:
 610                                                 exc_reason = r.bozo_exception.args[1]
 611                                                 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
 612
 613                                         elif exc_type == urllib2.URLError:
 614                                                 if r.bozo_exception.reason.__class__ in socket_errors:
 615                                                         exc_reason = r.bozo_exception.reason.args[1]
 616                                                 else:
 617                                                         exc_reason = r.bozo_exception.reason
 618                                                 print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
 619
 620                                         elif exc_type == AttributeError:
 621                                                 print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
 622
 623                                         elif exc_type == KeyboardInterrupt:
 624                                                 raise r.bozo_exception
 625
 626                                         elif r.bozo:
 627                                                 print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
 628
 629                                         else:
 630                                                 print >>warn, "=== rss2email encountered a problem with this feed ==="
 631                                                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
 632                                                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
 633                                                 print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
 634                                                 print >>warn, r
 635                                                 print >>warn, "rss2email", __version__
 636                                                 print >>warn, "feedparser", feedparser.__version__
 637                                                 print >>warn, "html2text", h2t.__version__
 638                                                 print >>warn, "Python", sys.version
 639                                                 print >>warn, "=== END HERE ==="
 640                                         continue
 641
 642                                 r.entries.reverse()
 643
 644                                 for entry in r.entries:
 645                                         id = getID(entry)
 646
 647                                         # If TRUST_GUID isn't set, we get back hashes of the content.
 648                                         # Instead of letting these run wild, we put them in context
 649                                         # by associating them with the actual ID (if it exists).
 650
 651                                         frameid = entry.get('id')
 652                                         if not(frameid): frameid = id
 653                                         if type(frameid) is DictType:
 654                                                 frameid = frameid.values()[0]
 655
 656                                         # If this item's ID is in our database
 657                                         # then it's already been sent
 658                                         # and we don't need to do anything more.
 659
 660                                         if frameid in f.seen:
 661                                                 if f.seen[frameid] == id: continue
 662
 663                                         if not (f.to or default_to):
 664                                                 print "No default email address defined. Please run 'r2e email emailaddress'"
 665                                                 print "Ignoring feed %s" % f.url
 666                                                 break
 667
 668                                         if 'title_detail' in entry and entry.title_detail:
 669                                                 title = entry.title_detail.value
 670                                                 if contains(entry.title_detail.type, 'html'):
 671                                                         title = html2text(title)
 672                                         else:
 673                                                 title = getContent(entry)[:70]
 674
 675                                         title = title.replace("\n", " ").strip()
 676
 677                                         datetime = time.gmtime()
 678
 679                                         if DATE_HEADER:
 680                                                 for datetype in DATE_HEADER_ORDER:
 681                                                         kind = datetype+"_parsed"
 682                                                         if kind in entry and entry[kind]: datetime = entry[kind]
 683
 684                                         link = entry.get('link', "")
 685
 686                                         from_addr = getEmail(r, entry)
 687
 688                                         name = h2t.unescape(getName(r, entry))
 689                                         fromhdr = formataddr((name, from_addr,))
 690                                         tohdr = (f.to or default_to)
 691                                         subjecthdr = title
 692                                         datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
 693                                         useragenthdr = "rss2email"
 694
 695                                         # Add post tags, if available
 696                                         tagline = ""
 697                                         if 'tags' in entry:
 698                                                 tags = entry.get('tags')
 699                                                 taglist = []
 700                                                 if tags:
 701                                                         for tag in tags:
 702                                                                 taglist.append(tag['term'])
 703                                                 if taglist:
 704                                                         tagline = ",".join(taglist)
 705
 706                                         extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
 707                                         if BONUS_HEADER != '':
 708                                                 for hdr in BONUS_HEADER.strip().splitlines():
 709                                                         pos = hdr.strip().find(':')
 710                                                         if pos > 0:
 711                                                                 extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
 712                                                         else:
 713                                                                 print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
 714
 715                                         entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
 716                                         contenttype = 'plain'
 717                                         content = ''
 718                                         if USE_CSS_STYLING and HTML_MAIL:
 719                                                 contenttype = 'html'
 720                                                 content = "<html>\n"
 721                                                 content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
 722                                                 content += '<body>\n'
 723                                                 content += '<div id="entry">\n'
 724                                                 content += '<h1'
 725                                                 content += ' class="header"'
 726                                                 content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
 727                                                 if ishtml(entrycontent):
 728                                                         body = entrycontent[1].strip()
 729                                                 else:
 730                                                         body = entrycontent.strip()
 731                                                 if body != '':
 732                                                         content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
 733                                                 content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
 734                                                 if hasattr(entry,'enclosures'):
 735                                                         for enclosure in entry.enclosures:
 736                                                                 if (hasattr(enclosure, 'url') and enclosure.url != ""):
 737                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
 738                                                                 if (hasattr(enclosure, 'src') and enclosure.src != ""):
 739                                                                         content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
 740                                                 if 'links' in entry:
 741                                                         for extralink in entry.links:
 742                                                                 if ('rel' in extralink) and extralink['rel'] == u'via':
 743                                                                         extraurl = extralink['href']
 744                                                                         extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
 745                                                                         viatitle = extraurl
 746                                                                         if ('title' in extralink):
 747                                                                             viatitle = extralink['title']
 748                                                                         content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
 749                                                 content += '</p></div>\n'
 750                                                 content += "\n\n</body></html>"
 751                                         else:
 752                                                 if ishtml(entrycontent):
 753                                                         contenttype = 'html'
 754                                                         content = "<html>\n"
 755                                                         content = ("<html><body>\n\n" +
 756                                                                    '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
 757                                                                    entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
 758                                                                    '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
 759
 760                                                         if hasattr(entry,'enclosures'):
 761                                                                 for enclosure in entry.enclosures:
 762                                                                         if enclosure.url != "":
 763                                                                                 content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
 764                                                         if 'links' in entry:
 765                                                                 for extralink in entry.links:
 766                                                                         if ('rel' in extralink) and extralink['rel'] == u'via':
 767                                                                                 content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
 768
 769                                                         content += ("\n</body></html>")
 770                                                 else:
 771                                                         content = entrycontent.strip() + "\n\nURL: "+link
 772                                                         if hasattr(entry,'enclosures'):
 773                                                                 for enclosure in entry.enclosures:
 774                                                                         if enclosure.url != "":
 775                                                                                 content += ('\nEnclosure: ' + enclosure.url + "\n")
 776                                                         if 'links' in entry:
 777                                                                 for extralink in entry.links:
 778                                                                         if ('rel' in extralink) and extralink['rel'] == u'via':
 779                                                                                 content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
 780
 781                                         smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
 782
 783                                         f.seen[frameid] = id
 784
 785                                 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
 786                         except (KeyboardInterrupt, SystemExit):
 787                                 raise
 788                         except:
 789                                 print >>warn, "=== rss2email encountered a problem with this feed ==="
 790                                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
 791                                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
 792                                 print >>warn, "E: could not parse", f.url
 793                                 traceback.print_exc(file=warn)
 794                                 print >>warn, "rss2email", __version__
 795                                 print >>warn, "feedparser", feedparser.__version__
 796                                 print >>warn, "html2text", h2t.__version__
 797                                 print >>warn, "Python", sys.version
 798                                 print >>warn, "=== END HERE ==="
 799                                 continue
 800
 801         finally:
 802                 unlock(feeds, feedfileObject)
 803                 if smtpserver:
 804                         smtpserver.quit()
 805
 806 def list():
 807         feeds, feedfileObject = load(lock=0)
 808         default_to = ""
 809
 810         if feeds and isstr(feeds[0]):
 811                 default_to = feeds[0]; ifeeds = feeds[1:]; i=1
 812                 print "default email:", default_to
 813         else: ifeeds = feeds; i = 0
 814         for f in ifeeds:
 815                 active = ('[ ]', '[*]')[f.active]
 816                 print `i`+':',active, f.url, '('+(f.to or ('default: '+default_to))+')'
 817                 if not (f.to or default_to):
 818                         print "   W: Please define a default address with 'r2e email emailaddress'"
 819                 i+= 1
 820
 821 def opmlexport():
 822         import xml.sax.saxutils
 823         feeds, feedfileObject = load(lock=0)
 824
 825         if feeds:
 826                 print '<?xml version="1.0" encoding="UTF-8"?>\n<opml version="1.0">\n<head>\n<title>rss2email OPML export</title>\n</head>\n<body>'
 827                 for f in feeds[1:]:
 828                         url = xml.sax.saxutils.escape(f.url)
 829                         print '<outline type="rss" text="%s" xmlUrl="%s"/>' % (url, url)
 830                 print '</body>\n</opml>'
 831
 832 def opmlimport(importfile):
 833         importfileObject = None
 834         print 'Importing feeds from', importfile
 835         if not os.path.exists(importfile):
 836                 print 'OPML import file "%s" does not exist.' % feedfile
 837         try:
 838                 importfileObject = open(importfile, 'r')
 839         except IOError, e:
 840                 print "OPML import file could not be opened: %s" % e
 841                 sys.exit(1)
 842         try:
 843                 import xml.dom.minidom
 844                 dom = xml.dom.minidom.parse(importfileObject)
 845                 newfeeds = dom.getElementsByTagName('outline')
 846         except:
 847                 print 'E: Unable to parse OPML file'
 848                 sys.exit(1)
 849
 850         feeds, feedfileObject = load(lock=1)
 851
 852         import xml.sax.saxutils
 853
 854         for f in newfeeds:
 855                 if f.hasAttribute('xmlUrl'):
 856                         feedurl = f.getAttribute('xmlUrl')
 857                         print 'Adding %s' % xml.sax.saxutils.unescape(feedurl)
 858                         feeds.append(Feed(feedurl, None))
 859
 860         unlock(feeds, feedfileObject)
 861
 862 def delete(n):
 863         feeds, feedfileObject = load()
 864         if (n == 0) and (feeds and isstr(feeds[0])):
 865                 print >>warn, "W: ID has to be equal to or higher than 1"
 866         elif n >= len(feeds):
 867                 print >>warn, "W: no such feed"
 868         else:
 869                 print >>warn, "W: deleting feed %s" % feeds[n].url
 870                 feeds = feeds[:n] + feeds[n+1:]
 871                 if n != len(feeds):
 872                         print >>warn, "W: feed IDs have changed, list before deleting again"
 873         unlock(feeds, feedfileObject)
 874
 875 def toggleactive(n, active):
 876         feeds, feedfileObject = load()
 877         if (n == 0) and (feeds and isstr(feeds[0])):
 878                 print >>warn, "W: ID has to be equal to or higher than 1"
 879         elif n >= len(feeds):
 880                 print >>warn, "W: no such feed"
 881         else:
 882                 action = ('Pausing', 'Unpausing')[active]
 883                 print >>warn, "%s feed %s" % (action, feeds[n].url)
 884                 feeds[n].active = active
 885         unlock(feeds, feedfileObject)
 886
 887 def reset():
 888         feeds, feedfileObject = load()
 889         if feeds and isstr(feeds[0]):
 890                 ifeeds = feeds[1:]
 891         else: ifeeds = feeds
 892         for f in ifeeds:
 893                 if VERBOSE: print "Resetting %d already seen items" % len(f.seen)
 894                 f.seen = {}
 895                 f.etag = None
 896                 f.modified = None
 897
 898         unlock(feeds, feedfileObject)
 899
 900 def email(addr):
 901         feeds, feedfileObject = load()
 902         if feeds and isstr(feeds[0]): feeds[0] = addr
 903         else: feeds = [addr] + feeds
 904         unlock(feeds, feedfileObject)
 905
 906 if __name__ == '__main__':
 907         args = sys.argv
 908         try:
 909                 if len(args) < 3: raise InputError, "insufficient args"
 910                 feedfile, action, args = args[1], args[2], args[3:]
 911
 912                 if action == "run":
 913                         if args and args[0] == "--no-send":
 914                                 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
 915                                         if VERBOSE: print 'Not sending:', unu(subject)
 916
 917                         if args and args[-1].isdigit(): run(int(args[-1]))
 918                         else: run()
 919
 920                 elif action == "email":
 921                         if not args:
 922                                 raise InputError, "Action '%s' requires an argument" % action
 923                         else:
 924                                 email(args[0])
 925
 926                 elif action == "add": add(*args)
 927
 928                 elif action == "new":
 929                         if len(args) == 1: d = [args[0]]
 930                         else: d = []
 931                         pickle.dump(d, open(feedfile, 'w'))
 932
 933                 elif action == "list": list()
 934
 935                 elif action in ("help", "--help", "-h"): print __doc__
 936
 937                 elif action == "delete":
 938                         if not args:
 939                                 raise InputError, "Action '%s' requires an argument" % action
 940                         elif args[0].isdigit():
 941                                 delete(int(args[0]))
 942                         else:
 943                                 raise InputError, "Action '%s' requires a number as its argument" % action
 944
 945                 elif action in ("pause", "unpause"):
 946                         if not args:
 947                                 raise InputError, "Action '%s' requires an argument" % action
 948                         elif args[0].isdigit():
 949                                 active = (action == "unpause")
 950                                 toggleactive(int(args[0]), active)
 951                         else:
 952                                 raise InputError, "Action '%s' requires a number as its argument" % action
 953
 954                 elif action == "reset": reset()
 955
 956                 elif action == "opmlexport": opmlexport()
 957
 958                 elif action == "opmlimport":
 959                         if not args:
 960                                 raise InputError, "OPML import '%s' requires a filename argument" % action
 961                         opmlimport(args[0])
 962
 963                 else:
 964                         raise InputError, "Invalid action"
 965
 966         except InputError, e:
 967                 print "E:", e
 968                 print
 969                 print __doc__
 970