Better attribute handling. Factored out tag handling into getTags()

author U-SEVEN\lindsey <lindsey.smith@gmail.com>

Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)

committer U-SEVEN\lindsey <lindsey.smith@gmail.com>

Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
author U-SEVEN\lindsey <lindsey.smith@gmail.com>
Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
committer U-SEVEN\lindsey <lindsey.smith@gmail.com>
Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
diff --git a/rss2email.py b/rss2email.py

index 3299cd5ed2d2ab2444c6172adfbb281e739d7219..995750b1f853dd577667c0ec9e64b44b1b21670f 100644 (file)
--- a/rss2email.py
+++ b/rss2email.py
@@ -24,6 +24,7 @@ ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
                       "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
  
  import urllib2
+import BeautifulSoup
  urllib2.install_opener(urllib2.build_opener())
  
  ### Vaguely Customizable Options ###
@@ -382,7 +383,8 @@ def getContent(entry, HTMLOK=0):
                 if not HTMLOK: # Only need to convert to text if HTML isn't OK
                         for c in conts:
                                 if contains(c.type, 'html'):
-                                       return html2text(c.value)
+                                       cleanerhtml = BeautifulSoup.BeautifulSoup(c.value)
+                                       return html2text(unicode(cleanerhtml))
                 
                 for c in conts:
                         if c.type == 'text/plain': return c.value
@@ -392,7 +394,8 @@ def getContent(entry, HTMLOK=0):
         return ""
  
  def getID(entry):
-       """Get best ID from an entry."""
+       """Get best ID from an entry.
+       NEEDS UNIT TESTS"""
         if TRUST_GUID:
                 if 'id' in entry and entry.id: 
                         # Newer versions of feedparser could return a dictionary
@@ -406,17 +409,17 @@ def getID(entry):
         if 'link' in entry: return entry.link
         if 'title' in entry: return hash(unu(entry.title)).hexdigest()
  
-def getName(r, entry):
+def getName(fullfeed, entry):
         """Get the best name.
         NEEDS UNIT TESTS"""
  
         if NO_FRIENDLY_NAME: return ''
  
-       feed = r.feed
-       if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
-               return OVERRIDE_FROM[r.url]
+       feedinfo = fullfeed.feed
+       if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys():
+               return OVERRIDE_FROM[fullfeed.url]
         
-       name = feed.get('title', '')
+       name = feedinfo.get('title', '')
  
         if 'name' in entry.get('author_detail', []): # normally {} but py2.1
                 if entry.author_detail.name:
@@ -427,10 +430,10 @@ def getName(r, entry):
                         except UnicodeDecodeError:
                             name +=  unicode(entry.author_detail.name, 'utf-8')
  
-       elif 'name' in feed.get('author_detail', []):
-               if feed.author_detail.name:
+       elif 'name' in feedinfo.get('author_detail', []):
+               if feedinfo.author_detail.name:
                         if name: name += ", "
-                       name += feed.author_detail.name
+                       name += feedinfo.author_detail.name
         
         return name
  
@@ -469,6 +472,21 @@ def getEmail(r, entry):
                 return DEFAULT_EMAIL[r.url]
         return DEFAULT_FROM
  
+def getTags(entry):
+       """If the entry has any tags, build a tagline and return as a string. Otherwise returns empty string"""
+       tagline = ""
+       if 'tags' in entry:
+               tags = entry.get('tags')
+               taglist = []
+               if tags:
+                       for tag in tags:
+                               if tag.has_key('term'): taglist.append(tag['term'])
+               if taglist:
+                       tagline = ",".join(taglist)
+
+       return tagline
+       
+
  ### Simple Database of Feeds ###
  
  class Feed:
@@ -689,16 +707,8 @@ def run(num=None):
                                         useragenthdr = "rss2email"
                                         
                                         # Add post tags, if available
-                                       tagline = ""
-                                       if 'tags' in entry:
-                                               tags = entry.get('tags')
-                                               taglist = []
-                                               if tags:
-                                                       for tag in tags:
-                                                               taglist.append(tag['term'])
-                                               if taglist:
-                                                       tagline = ",".join(taglist)
-                                       
+                                       tagline = getTags(entry)
+
                                         extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
                                         if BONUS_HEADER != '':
                                                 for hdr in BONUS_HEADER.strip().splitlines():
author	U-SEVEN\lindsey <lindsey.smith@gmail.com>
	Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)
committer	U-SEVEN\lindsey <lindsey.smith@gmail.com>
	Fri, 24 Jun 2011 18:07:22 +0000 (11:07 -0700)