WIP: feed: Add the include-references setting for cid: linking img-cid
authorW. Trevor King <wking@tremily.us>
Sun, 14 Apr 2013 15:26:31 +0000 (11:26 -0400)
committerW. Trevor King <wking@tremily.us>
Sun, 14 Apr 2013 15:26:31 +0000 (11:26 -0400)
This still needs an HTML filter to find and replace links
(_process_entry_content_html).  It also needs something real in
_get_reference() for downloading and creating the referenced parts.

Signed-off-by: W. Trevor King <wking@tremily.us>
14 files changed:
rss2email/config.py
rss2email/feed.py
test/xkcd/1.config [new file with mode: 0644]
test/xkcd/1.expected [new file with mode: 0644]
test/xkcd/2.config [new file with mode: 0644]
test/xkcd/2.expected [new file with mode: 0644]
test/xkcd/3.config [new file with mode: 0644]
test/xkcd/4.config [new file with mode: 0644]
test/xkcd/README [new file with mode: 0644]
test/xkcd/feed.atom [new file with mode: 0644]
test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png [new file with mode: 0644]
test/xkcd/imgs.xkcd.com/comics/flowchart.png [new file with mode: 0644]
test/xkcd/imgs.xkcd.com/comics/geologist.png [new file with mode: 0644]
test/xkcd/imgs.xkcd.com/comics/subways.png [new file with mode: 0644]

index 22f862667789a71beeb76dd60ef905fd94f1fad6..aac8e44cd8f3d27f8bfdf6c3ae4b34f1511d7a27 100644 (file)
@@ -98,6 +98,9 @@ CONFIG['DEFAULT'] = _collections.OrderedDict((
         # characters, we iterate through the list below and use the
         # first character set that works.
         ('encodings', 'US-ASCII, ISO-8859-1, UTF-8, BIG5, ISO-2022-JP'),
+        # True: Use RFC 2392's cid: to include and link referenced data.
+        # False: Only include the converted entry, not its references.
+        ('include-references', str(False)),
         ## HTML conversion
         # True: Send text/html messages when possible.
         # False: Convert HTML to plain text.
index c71f524e58f09bc1bf1ecbe89ac9a89ba943398a..34d05f6098da98b3faeded5b2552fd9f9b07fab3 100644 (file)
@@ -28,6 +28,8 @@
 """
 
 import collections as _collections
+from email.mime.multipart import MIMEMultipart as _MIMEMultipart
+from email.mime.nonmultipart import MIMENonMultipart as _MIMENonMultipart
 from email.utils import formataddr as _formataddr
 import hashlib as _hashlib
 import html as _html
@@ -36,6 +38,7 @@ import re as _re
 import socket as _socket
 import time as _time
 import urllib.error as _urllib_error
+import urllib.parse as _urllib_parse
 import urllib.request as _urllib_request
 import xml.sax as _sax
 
@@ -163,6 +166,7 @@ class Feed (object):
         'active',
         'date_header',
         'trust_guid',
+        'include_references',
         'html_mail',
         'use_css',
         'unicode_snob',
@@ -429,17 +433,22 @@ class Feed (object):
 
         content = self._get_entry_content(entry)
         try:
-            content = self._process_entry_content(
+            parts = self._process_entry_content(
                 entry=entry, content=content, subject=subject)
         except _error.ProcessingError as e:
             e.parsed = parsed
             raise
-        message = _email.get_message(
+        if len(parts) == 1:
+            message = parts[0]
+        else:
+            _MIMEMultipart()
+            for part in parts:
+                message.attach(part)
+        _email.set_headers(
+            message=message,
             sender=sender,
             recipient=self.to,
             subject=subject,
-            body=content['value'],
-            content_type=content['type'].split('/', 1)[1],
             extra_headers=extra_headers,
             config=self.config,
             section=self.section)
@@ -663,8 +672,21 @@ class Feed (object):
         return {'type': 'text/plain', 'value': ''}
 
     def _process_entry_content(self, entry, content, subject):
-        "Convert entry content to the requested format."
+        """Convert entry content to the requested format
+
+        Returns a list of parts, with a `text/*` part first containing
+        the content.  If `self.include_references` is True, the
+        referenced parts are also included as attachments.
+        """
+        parts = []
         link = self._get_entry_link(entry)
+        if content['type'] in ('text/html', 'application/xhtml+xml'):
+            html_content,new_parts = self._process_entry_content_html(
+                entry=entry, content=content, subject=subject,
+                html=content['value'].strip())
+            parts.extend(new_parts)
+        else:
+            html_content = _html.escape(content['value'].strip())
         if self.html_mail:
             lines = [
                 '<!DOCTYPE html>',
@@ -695,22 +717,30 @@ class Feed (object):
                     '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
                     ])
             for enclosure in getattr(entry, 'enclosures', []):
+                part = None
                 if getattr(enclosure, 'url', None):
+                    ref_link,part = self._get_reference(url=enclosure.url)
                     lines.append(
                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
-                            enclosure.url))
+                            ref_link))
                 if getattr(enclosure, 'src', None):
+                    ref_link,part = self._get_reference(url=enclosure.src)
                     lines.append(
                         '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
-                            enclosure.src))
-                    lines.append(
-                        '<p><img src="{}" /></p>'.format(enclosure.src))
+                            ref_link))
+                    lines.append('<p><img src="{}" /></p>'.format(ref_link))
+                if part:
+                    parts.append(part)
             for elink in getattr(entry, 'links', []):
+                part = None
                 if elink.get('rel', None) == 'via':
                     url = elink['href']
+                    ref_link,part = self._get_reference(url=url)
                     title = elink.get('title', url)
                     lines.append('<p>Via <a href="{}">{}</a></p>'.format(
-                            url, title))
+                            ref_link, title))
+                if part:
+                    parts.append(part)
             lines.extend([
                     '</div>',  # /footer
                     '</div>',  # /entry
@@ -719,7 +749,6 @@ class Feed (object):
                     ''])
             content['type'] = 'text/html'
             content['value'] = '\n'.join(lines)
-            return content
         else:  # not self.html_mail
             if content['type'] in ('text/html', 'application/xhtml+xml'):
                 try:
@@ -731,18 +760,64 @@ class Feed (object):
             lines.append('')
             lines.append('URL: {}'.format(link))
             for enclosure in getattr(entry, 'enclosures', []):
-                if getattr(enclosure, 'url', None):
-                    lines.append('Enclosure: {}'.format(enclosure.url))
-                if getattr(enclosure, 'src', None):
-                    lines.append('Enclosure: {}'.format(enclosure.src))
+                for url in [
+                        getattr(enclosure, 'url', None),
+                        getattr(enclosure, 'src', None),
+                        ]:
+                    ref_link,part = self._get_reference(url=url)
+                    lines.append('Enclosure: {}'.format(ref_link))
+                    if part:
+                        parts.append(part)
             for elink in getattr(entry, 'links', []):
                 if elink.get('rel', None) == 'via':
                     url = elink['href']
+                    ref_link,part = self._get_reference(url=url)
                     title = elink.get('title', url)
-                    lines.append('Via: {} {}'.format(title, url))
+                    lines.append('Via: {} {}'.format(title, ref_link))
+                    if part:
+                        parts.append(part)
             content['type'] = 'text/plain'
             content['value'] = '\n'.join(lines)
-            return content
+        content_part = _email.get_mimetext(
+            body=content['value'],
+            content_type=content['type'].split('/', 1)[1],
+            config=self.config,
+            section=self.section)
+        parts.insert(0, content_part)
+        return parts
+
+    def _process_entry_content_html(self, entry, content, subject, html):
+        """Manipulate the entry HTML
+
+        For example, replace links to images with cid: links if
+        `self.include_references` is True.
+
+        Returns the new HTML and a list of parts (which may be empty).
+        """
+        return (html, [])  # TODO: stub; `html` is passed through unchanged
+
+    def _get_reference(self, url):
+        """Get references for cid: links.
+
+        RFC 2392 [1] provides linking between message parts based on
+        Content-IDs via `cid:...` URLs.  If `self.include_references`
+        is True, return a tuple containing a `cid:...` URL and the MIME
+        part for the referenced data (TODO: still a placeholder, `url`
+        is not downloaded yet).  Otherwise, return `(url, None)`.
+
+        [1]: http://tools.ietf.org/html/rfc2392
+        """
+        if not self.include_references:
+            return (url, None)
+        from email.encoders import encode_base64 as _encode_base64
+        cid = _email.get_id()  # presumably '<...>'-wrapped; TODO confirm
+        link = 'cid:{}'.format(_urllib_parse.quote(cid[1:-1]))
+        part = _MIMENonMultipart('image', 'png')
+        part['Content-ID'] = cid  # RFC 2392: cid: URLs match Content-ID
+        part.set_payload(b'AAA')  # placeholder bytes until downloading works
+        _encode_base64(part)  # bytes payloads need a transfer encoding
+        _LOG.debug((link, part))
+        return (link, part)
 
     def _send(self, sender, message):
         _LOG.info('send message for {}'.format(self))
diff --git a/test/xkcd/1.config b/test/xkcd/1.config
new file mode 100644 (file)
index 0000000..15935c4
--- /dev/null
@@ -0,0 +1,3 @@
+[DEFAULT]
+to = a@b.com
+date-header = True
diff --git a/test/xkcd/1.expected b/test/xkcd/1.expected
new file mode 100644 (file)
index 0000000..c815599
--- /dev/null
@@ -0,0 +1,87 @@
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Flowchart
+Date: Fri, 05 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1195/
+X-RSS-URL: http://xkcd.com/1195/
+
+![The way out is to use the marker you have to add a box that says 'get a
+marker' to the line between you and 'start', then add a 'no' line from the
+trap box to 'end'.](http://imgs.xkcd.com/comics/flowchart.png)
+
+
+
+URL: http://xkcd.com/1195/
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Subways
+Date: Mon, 08 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1196/
+X-RSS-URL: http://xkcd.com/1196/
+
+![About one in three North American subway stops are in
+NYC.](http://imgs.xkcd.com/comics/subways.png)
+
+
+
+URL: http://xkcd.com/1196/
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: All Adobe Updates
+Date: Wed, 10 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1197/
+X-RSS-URL: http://xkcd.com/1197/
+
+![ALERT: Some pending mandatory software updates require version 21.1.2 of the
+Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater,
+which is not available for your
+platform.](http://imgs.xkcd.com/comics/all_adobe_updates.png)
+
+
+
+URL: http://xkcd.com/1197/
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Geologist
+Date: Fri, 12 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1198/
+X-RSS-URL: http://xkcd.com/1198/
+
+!['It seems like it's still alive, Professor.' 'Yeah, a big one like this can
+keep running around for a few billion years after you remove the
+head."](http://imgs.xkcd.com/comics/geologist.png)
+
+
+
+URL: http://xkcd.com/1198/
diff --git a/test/xkcd/2.config b/test/xkcd/2.config
new file mode 100644 (file)
index 0000000..a9794f6
--- /dev/null
@@ -0,0 +1,4 @@
+[DEFAULT]
+to = a@b.com
+date-header = True
+html-mail = True
diff --git a/test/xkcd/2.expected b/test/xkcd/2.expected
new file mode 100644 (file)
index 0000000..7b7a2fd
--- /dev/null
@@ -0,0 +1,123 @@
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Flowchart
+Date: Fri, 05 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1195/
+X-RSS-URL: http://xkcd.com/1195/
+
+<!DOCTYPE html>
+<html>
+  <head>
+</head>
+<body>
+<div id="entry>
+<h1 class="header"><a href="http://xkcd.com/1195/">Flowchart</a></h1>
+<div id="body">
+<img alt="The way out is to use the marker you have to add a box that says 'get a marker' to the line between you and 'start', then add a 'no' line from the trap box to 'end'." src="http://imgs.xkcd.com/comics/flowchart.png" title="The way out is to use the marker you have to add a box that says 'get a marker' to the line between you and 'start', then add a 'no' line from the trap box to 'end'." />
+</div>
+<div class="footer"><p>URL: <a href="http://xkcd.com/1195/">http://xkcd.com/1195/</a></p>
+</div>
+</div>
+</body>
+</html>
+
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Subways
+Date: Mon, 08 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1196/
+X-RSS-URL: http://xkcd.com/1196/
+
+<!DOCTYPE html>
+<html>
+  <head>
+</head>
+<body>
+<div id="entry>
+<h1 class="header"><a href="http://xkcd.com/1196/">Subways</a></h1>
+<div id="body">
+<img alt="About one in three North American subway stops are in NYC." src="http://imgs.xkcd.com/comics/subways.png" title="About one in three North American subway stops are in NYC." />
+</div>
+<div class="footer"><p>URL: <a href="http://xkcd.com/1196/">http://xkcd.com/1196/</a></p>
+</div>
+</div>
+</body>
+</html>
+
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: All Adobe Updates
+Date: Wed, 10 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1197/
+X-RSS-URL: http://xkcd.com/1197/
+
+<!DOCTYPE html>
+<html>
+  <head>
+</head>
+<body>
+<div id="entry>
+<h1 class="header"><a href="http://xkcd.com/1197/">All Adobe Updates</a></h1>
+<div id="body">
+<img alt="ALERT: Some pending mandatory software updates require version 21.1.2 of the Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, which is not available for your platform." src="http://imgs.xkcd.com/comics/all_adobe_updates.png" title="ALERT: Some pending mandatory software updates require version 21.1.2 of the Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, which is not available for your platform." />
+</div>
+<div class="footer"><p>URL: <a href="http://xkcd.com/1197/">http://xkcd.com/1197/</a></p>
+</div>
+</div>
+</body>
+</html>
+
+
+SENT BY: "xkcd.com" <user@rss2email.invalid>
+MIME-Version: 1.0
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+From: "xkcd.com" <user@rss2email.invalid>
+To: a@b.com
+Subject: Geologist
+Date: Fri, 12 Apr 2013 00:00:00 -0000
+Message-ID: <...@dev.null.invalid>
+User-Agent: rss2email
+X-RSS-Feed: xkcd/feed.atom
+X-RSS-ID: http://xkcd.com/1198/
+X-RSS-URL: http://xkcd.com/1198/
+
+<!DOCTYPE html>
+<html>
+  <head>
+</head>
+<body>
+<div id="entry>
+<h1 class="header"><a href="http://xkcd.com/1198/">Geologist</a></h1>
+<div id="body">
+<img alt="'It seems like it's still alive, Professor.' 'Yeah, a big one like this can keep running around for a few billion years after you remove the head.&quot;" src="http://imgs.xkcd.com/comics/geologist.png" title="'It seems like it's still alive, Professor.' 'Yeah, a big one like this can keep running around for a few billion years after you remove the head.&quot;" />
+</div>
+<div class="footer"><p>URL: <a href="http://xkcd.com/1198/">http://xkcd.com/1198/</a></p>
+</div>
+</div>
+</body>
+</html>
+
diff --git a/test/xkcd/3.config b/test/xkcd/3.config
new file mode 100644 (file)
index 0000000..c9da8c1
--- /dev/null
@@ -0,0 +1,4 @@
+[DEFAULT]
+to = a@b.com
+date-header = True
+include-references = True
diff --git a/test/xkcd/4.config b/test/xkcd/4.config
new file mode 100644 (file)
index 0000000..d8e0f9d
--- /dev/null
@@ -0,0 +1,5 @@
+[DEFAULT]
+to = a@b.com
+date-header = True
+include-references = True
+html-mail = True
diff --git a/test/xkcd/README b/test/xkcd/README
new file mode 100644 (file)
index 0000000..d6fb8b0
--- /dev/null
@@ -0,0 +1,22 @@
+feed.atom is a snapshot of
+
+  http://xkcd.com/atom.xml
+
+as of 2013-04-14.
+
+HTTP headers:
+
+  HTTP/1.1 200 OK
+  Vary: Accept-Encoding
+  Content-Type: application/xml
+  Accept-Ranges: bytes
+  ETag: "3353825625"
+  Last-Modified: Fri, 12 Apr 2013 04:00:05 GMT
+  Content-Length: 2498
+  Date: Sun, 14 Apr 2013 12:49:47 GMT
+  Server: lighttpd/1.4.28
+
+The URLs referenced from the entries were downloaded on the same day
+and stored with filenames matching their URL
+(e.g. `http://imgs.xkcd.com/comics/geologist.png` is stored as
+`imgs.xkcd.com/comics/geologist.png`).
diff --git a/test/xkcd/feed.atom b/test/xkcd/feed.atom
new file mode 100644 (file)
index 0000000..466653a
--- /dev/null
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-04-12T00:00:00Z</updated><entry><title>Geologist</title><link href="http://xkcd.com/1198/" rel="alternate"></link><updated>2013-04-12T00:00:00Z</updated><id>http://xkcd.com/1198/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/geologist.png" title="'It seems like it's still alive, Professor.' 'Yeah, a big one like this can keep running around for a few billion years after you remove the head.&amp;quot;" alt="'It seems like it's still alive, Professor.' 'Yeah, a big one like this can keep running around for a few billion years after you remove the head.&amp;quot;" /&gt;</summary></entry><entry><title>All Adobe Updates</title><link href="http://xkcd.com/1197/" rel="alternate"></link><updated>2013-04-10T00:00:00Z</updated><id>http://xkcd.com/1197/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/all_adobe_updates.png" title="ALERT: Some pending mandatory software updates require version 21.1.2 of the Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, which is not available for your platform." alt="ALERT: Some pending mandatory software updates require version 21.1.2 of the Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, which is not available for your platform." /&gt;</summary></entry><entry><title>Subways</title><link href="http://xkcd.com/1196/" rel="alternate"></link><updated>2013-04-08T00:00:00Z</updated><id>http://xkcd.com/1196/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/subways.png" title="About one in three North American subway stops are in NYC." alt="About one in three North American subway stops are in NYC." 
/&gt;</summary></entry><entry><title>Flowchart</title><link href="http://xkcd.com/1195/" rel="alternate"></link><updated>2013-04-05T00:00:00Z</updated><id>http://xkcd.com/1195/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/flowchart.png" title="The way out is to use the marker you have to add a box that says 'get a marker' to the line between you and 'start', then add a 'no' line from the trap box to 'end'." alt="The way out is to use the marker you have to add a box that says 'get a marker' to the line between you and 'start', then add a 'no' line from the trap box to 'end'." /&gt;</summary></entry></feed>
\ No newline at end of file
diff --git a/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png b/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png
new file mode 100644 (file)
index 0000000..e033c6f
Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png differ
diff --git a/test/xkcd/imgs.xkcd.com/comics/flowchart.png b/test/xkcd/imgs.xkcd.com/comics/flowchart.png
new file mode 100644 (file)
index 0000000..fbd0123
Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/flowchart.png differ
diff --git a/test/xkcd/imgs.xkcd.com/comics/geologist.png b/test/xkcd/imgs.xkcd.com/comics/geologist.png
new file mode 100644 (file)
index 0000000..c8c81a8
Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/geologist.png differ
diff --git a/test/xkcd/imgs.xkcd.com/comics/subways.png b/test/xkcd/imgs.xkcd.com/comics/subways.png
new file mode 100644 (file)
index 0000000..6093927
Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/subways.png differ