Better attribute handling. Factored out tag handling into getTags()
author U-SEVEN\lindsey <lindsey.smith@gmail.com>
Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
committer U-SEVEN\lindsey <lindsey.smith@gmail.com>
Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
rss2email.py

index 3299cd5ed2d2ab2444c6172adfbb281e739d7219..995750b1f853dd577667c0ec9e64b44b1b21670f 100644 (file)
@@ -24,6 +24,7 @@ ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
                      "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
 
 import urllib2
+import BeautifulSoup
 urllib2.install_opener(urllib2.build_opener())
 
 ### Vaguely Customizable Options ###
@@ -382,7 +383,8 @@ def getContent(entry, HTMLOK=0):
                if not HTMLOK: # Only need to convert to text if HTML isn't OK
                        for c in conts:
                                if contains(c.type, 'html'):
-                                       return html2text(c.value)
+                                       cleanerhtml = BeautifulSoup.BeautifulSoup(c.value)
+                                       return html2text(unicode(cleanerhtml))
                
                for c in conts:
                        if c.type == 'text/plain': return c.value
@@ -392,7 +394,8 @@ def getContent(entry, HTMLOK=0):
        return ""
 
 def getID(entry):
-       """Get best ID from an entry."""
+       """Get best ID from an entry.
+       NEEDS UNIT TESTS"""
        if TRUST_GUID:
                if 'id' in entry and entry.id: 
                        # Newer versions of feedparser could return a dictionary
@@ -406,17 +409,17 @@ def getID(entry):
        if 'link' in entry: return entry.link
        if 'title' in entry: return hash(unu(entry.title)).hexdigest()
 
-def getName(r, entry):
+def getName(fullfeed, entry):
        """Get the best name.
        NEEDS UNIT TESTS"""
 
        if NO_FRIENDLY_NAME: return ''
 
-       feed = r.feed
-       if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
-               return OVERRIDE_FROM[r.url]
+       feedinfo = fullfeed.feed
+       if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys():
+               return OVERRIDE_FROM[fullfeed.url]
        
-       name = feed.get('title', '')
+       name = feedinfo.get('title', '')
 
        if 'name' in entry.get('author_detail', []): # normally {} but py2.1
                if entry.author_detail.name:
@@ -427,10 +430,10 @@ def getName(r, entry):
                        except UnicodeDecodeError:
                            name +=  unicode(entry.author_detail.name, 'utf-8')
 
-       elif 'name' in feed.get('author_detail', []):
-               if feed.author_detail.name:
+       elif 'name' in feedinfo.get('author_detail', []):
+               if feedinfo.author_detail.name:
                        if name: name += ", "
-                       name += feed.author_detail.name
+                       name += feedinfo.author_detail.name
        
        return name
 
@@ -469,6 +472,21 @@ def getEmail(r, entry):
                return DEFAULT_EMAIL[r.url]
        return DEFAULT_FROM
 
+def getTags(entry):
+       """Return the 'term' of each of the entry's tags joined with commas, or an empty string when the entry has no tags or none of them carries a 'term'."""
+       tagline = ""
+       if 'tags' in entry:
+               tags = entry.get('tags')
+               taglist = []
+               if tags:  # the 'tags' key may be present but hold an empty/None value
+                       for tag in tags:
+                               if tag.has_key('term'): taglist.append(tag['term'])  # tags without a 'term' key are skipped
+               if taglist:
+                       tagline = ",".join(taglist)  # comma only, no space after it
+
+       return tagline
+       
+
 ### Simple Database of Feeds ###
 
 class Feed:
@@ -689,16 +707,8 @@ def run(num=None):
                                        useragenthdr = "rss2email"
                                        
                                        # Add post tags, if available
-                                       tagline = ""
-                                       if 'tags' in entry:
-                                               tags = entry.get('tags')
-                                               taglist = []
-                                               if tags:
-                                                       for tag in tags:
-                                                               taglist.append(tag['term'])
-                                               if taglist:
-                                                       tagline = ",".join(taglist)
-                                       
+                                       tagline = getTags(entry)
+
                                        extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
                                        if BONUS_HEADER != '':
                                                for hdr in BONUS_HEADER.strip().splitlines():