1 # Copyright (C) 2013 Francois Boulogne <fboulogne at april dot org>
3 # This file is part of rss2email.
5 # rss2email is free software: you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation, either version 2 of the License, or (at your option) version 3 of
10 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
11 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License along with
15 # rss2email. If not, see <http://www.gnu.org/licenses/>.
17 """Remove redirects on the post URL.
19 Several websites use redirects (e.g. feedburner) for various reasons like
20 statistics. You may want to avoid this for privacy or for durability.
This hook finds and uses the real URL behind redirects.
32 LOG = _logging.getLogger(__name__)
def process(feed, parsed, entry, guid, message):
    """Replace redirected links in the message body with their targets.

    The entry's own link and the ``href`` of each enclosure are requested
    once each; whatever URL the response reports after redirects have been
    followed is substituted for every literal occurrence of the original
    link in the decoded payload.  A link that cannot be resolved is logged
    and left as it is, so one broken URL never blocks the message.

    :param feed: unused by this hook.
    :param parsed: unused by this hook.
    :param entry: mapping that may provide ``'link'`` and ``'enclosures'``
        (each enclosure being a mapping with an ``'href'`` key).
    :param guid: unused by this hook.
    :param message: email message whose payload is rewritten in place.
    :returns: ``message`` itself.
    """
    # Collect the candidate URLs: the post link first, then the enclosures.
    candidates = [entry.get('link')]
    candidates.extend(
        enclosure.get('href') for enclosure in entry.get('enclosures') or ())

    # Skip empty values and duplicates so each URL is requested only once.
    links = []
    for candidate in candidates:
        if candidate and candidate not in links:
            links.append(candidate)
    if not links:
        # Nothing to resolve: leave the message exactly as it was given.
        return message

    encoding = message.get_charsets()[0]
    content = str(message.get_payload(decode=True), encoding)

    # Remove the redirect and modify the content
    timeout = rss2email.config.CONFIG['DEFAULT'].getint('feed-timeout')
    # Only pass a timeout that is actually usable; a zero or missing value
    # falls back to urllib's default behaviour.
    open_kwargs = {'timeout': timeout} if timeout and timeout > 0 else {}
    # NOTE(review): the 'proxy' setting of CONFIG['DEFAULT'] used to be read
    # here but was never applied to the request.  Wire it through a
    # urllib.request.ProxyHandler if redirects must be resolved via a proxy.
    for link in links:
        try:
            request = urllib.request.Request(link)
            request.add_header('User-agent', rss2email.feed._USER_AGENT)
            # The context manager closes the connection once the final URL
            # is known; the body itself is never needed.
            with urllib.request.urlopen(request, **open_kwargs) as response:
                direct_link = response.geturl()
        except Exception as e:
            # Best effort: keep the original link when it cannot be followed.
            LOG.warning('could not follow redirect for {}: {}'.format(
                link, e))
            continue
        if direct_link != link:
            # Literal replacement of every occurrence.  A regex substitution
            # would treat backslashes in the target URL as escapes, and
            # passing re.MULTILINE positionally to re.sub sets ``count``
            # (limiting the number of replacements) rather than ``flags``.
            content = content.replace(link, direct_link)

    # clear CTE and set message. It can be important to clear the CTE
    # before setting the payload, since the payload is only re-encoded
    # if CTE is not already set.
    del message['Content-Transfer-Encoding']
    message.set_payload(content, charset=encoding)
    return message