# Provenance: unfolded from git patch 18cde86ad1a2ce183d49f2c380fe4be8b2242c80
#   Author:  François Boulogne
#   Date:    Sun, 29 Sep 2013 22:34:10 +0200
#   Subject: [PATCH] post_process.redirect: Add hook to remove redirections
#   Signed-off-by: François Boulogne
#   Signed-off-by: W. Trevor King
#   Target file: rss2email/post_process/redirect.py
#
# Copyright (C) 2013 Francois Boulogne
#
# This file is part of rss2email.
#
# rss2email is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 2 of the License, or (at your option) version 3 of
# the License.
#
# rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# rss2email.  If not, see <http://www.gnu.org/licenses/>.

"""Remove redirects on the post URL.

Several websites use redirects (e.g. feedburner) for various reasons like
statistics.  You may want to avoid this for privacy or for durability.

This hook finds and uses the real url behind redirects.
"""

import http.client
import logging
# ``import urllib`` alone does not load the ``request`` submodule; import it
# explicitly so ``urllib.request.urlopen`` is always available.
import urllib.request

_LOG = logging.getLogger(__name__)

# Failures that mean "this link could not be resolved": network and HTTP
# errors (URLError/HTTPError/timeouts are all OSError), malformed or
# unsupported URLs (ValueError), and protocol-level breakage.
_RESOLVE_ERRORS = (OSError, ValueError, http.client.HTTPException)


def _resolve_redirects(link):
    """Return the URL reached after following redirects from *link*.

    Returns ``None`` (after logging a warning) when the link cannot be
    opened, so that the caller can fall back to the original message.
    """
    try:
        # The context manager closes the connection as soon as the final
        # URL is known; the response body itself is never needed.
        with urllib.request.urlopen(link) as response:
            return response.geturl()
    except _RESOLVE_ERRORS as error:
        _LOG.warning('could not resolve redirects for %s: %s', link, error)
        return None


def process(feed, parsed, entry, guid, message):
    """Replace the entry's link in *message* with its redirect target.

    Parameters:
      feed, parsed, guid: part of the post-processing hook signature;
        not used by this hook.
      entry: mapping for the feed entry; its ``'link'`` value (if any) is
        the URL to resolve.
      message: ``email.message.Message`` whose text payload mentions the
        link.  It is modified in place.

    Returns:
      *message*.  It is returned untouched when the entry has no link,
      the link cannot be resolved, the link is not a redirect, or the
      message has no decodable single-part text payload.
    """
    link = entry.get('link')
    if not link:
        return message

    direct_link = _resolve_redirects(link)
    if not direct_link or direct_link == link:
        # Nothing to rewrite: avoid needlessly re-encoding the message.
        return message

    encoding = message.get_charsets()[0]
    payload = message.get_payload(decode=True)
    if encoding is None or payload is None:
        # No declared charset, or a multipart container (whose decoded
        # payload is None): there is no text body to rewrite safely.
        return message
    content = str(payload, encoding)

    # Literal replacement of every occurrence.  A regex substitution is
    # deliberately avoided: the resolved URL must not be interpreted as a
    # replacement template, and no occurrence limit should apply.
    content = content.replace(link, direct_link)

    # clear CTE and set message.  It can be important to clear the CTE
    # before setting the payload, since the payload is only re-encoded
    # if CTE is not already set.
    del message['Content-Transfer-Encoding']
    message.set_payload(content, charset=encoding)

    return message