xkcd.com

From: W. Trevor King Date: Sun, 14 Apr 2013 15:26:31 +0000 (-0400) Subject: WIP: feed: Add the include-references setting for cid: linking X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=refs%2Fheads%2Fimg-cid;p=rss2email.git WIP: feed: Add the include-references setting for cid: linking This still needs an HTML filter to find and replace links (_process_entry_content_html). It also needs something real in _get_reference() for downloading and creating the referenced parts. Signed-off-by: W. Trevor King --- diff --git a/rss2email/config.py b/rss2email/config.py index 22f8626..aac8e44 100644 --- a/rss2email/config.py +++ b/rss2email/config.py @@ -98,6 +98,9 @@ CONFIG['DEFAULT'] = _collections.OrderedDict(( # characters, we iterate through the list below and use the # first character set that works. ('encodings', 'US-ASCII, ISO-8859-1, UTF-8, BIG5, ISO-2022-JP'), + # True: Use RFC 2392's cid: to include and link referenced data. + # False: Only include the converted entry, not its references. + ('include-references', str(False)), ## HTML conversion # True: Send text/html messages when possible. # False: Convert HTML to plain text. 
diff --git a/rss2email/feed.py b/rss2email/feed.py index c71f524..34d05f6 100644 --- a/rss2email/feed.py +++ b/rss2email/feed.py @@ -28,6 +28,8 @@ """ import collections as _collections +from email.mime.multipart import MIMEMultipart as _MIMEMultipart +from email.mime.nonmultipart import MIMENonMultipart as _MIMENonMultipart from email.utils import formataddr as _formataddr import hashlib as _hashlib import html as _html @@ -36,6 +38,7 @@ import re as _re import socket as _socket import time as _time import urllib.error as _urllib_error +import urllib.parse as _urllib_parse import urllib.request as _urllib_request import xml.sax as _sax @@ -163,6 +166,7 @@ class Feed (object): 'active', 'date_header', 'trust_guid', + 'include_references', 'html_mail', 'use_css', 'unicode_snob', @@ -429,17 +433,22 @@ class Feed (object): content = self._get_entry_content(entry) try: - content = self._process_entry_content( + parts = self._process_entry_content( entry=entry, content=content, subject=subject) except _error.ProcessingError as e: e.parsed = parsed raise - message = _email.get_message( + if len(parts) == 1: + message = parts[0] + else: + _MIMEMultipart() + for part in parts: + message.attach(part) + _email.set_headers( + message=message, sender=sender, recipient=self.to, subject=subject, - body=content['value'], - content_type=content['type'].split('/', 1)[1], extra_headers=extra_headers, config=self.config, section=self.section) @@ -663,8 +672,21 @@ class Feed (object): return {'type': 'text/plain', 'value': ''} def _process_entry_content(self, entry, content, subject): - "Convert entry content to the requested format." + """Convert entry content to the requested format + + Returns a list of parts, with a `text/*` part first containing + the content. If `self.include_references` is True, the + referenced parts are also included as attachments. 
+ """ + parts = [] link = self._get_entry_link(entry) + if content['type'] in ('text/html', 'application/xhtml+xml'): + html_content,new_parts = self._process_entry_content_html( + entry=entry, content=content, subject=subject, + html=content['value'].strip()) + parts.extend(new_parts) + else: + html_content = _html.escape(content['value'].strip()) if self.html_mail: lines = [ '', @@ -695,22 +717,30 @@ class Feed (object): '

URL: {0}

'.format(link), ]) for enclosure in getattr(entry, 'enclosures', []): + part = None if getattr(enclosure, 'url', None): + ref_link,part = self._get_reference(url=enclosure.url) lines.append( '

Enclosure: {0}

'.format( - enclosure.url)) + ref_link)) if getattr(enclosure, 'src', None): + ref_link,part = self._get_reference(url=enclosure.src) lines.append( '

Enclosure: {0}

'.format( - enclosure.src)) - lines.append( - '

'.format(enclosure.src)) + ref_link)) + lines.append('

'.format(ref_link)) + if part: + parts.append(part) for elink in getattr(entry, 'links', []): + part = None if elink.get('rel', None) == 'via': url = elink['href'] + ref_link,part = self._get_reference(url=url) title = elink.get('title', url) lines.append('

Via {}

'.format( - url, title)) + ref_link, title)) + if part: + parts.append(part) lines.extend([ '', # /footer '', # /entry @@ -719,7 +749,6 @@ class Feed (object): '']) content['type'] = 'text/html' content['value'] = '\n'.join(lines) - return content else: # not self.html_mail if content['type'] in ('text/html', 'application/xhtml+xml'): try: @@ -731,18 +760,64 @@ class Feed (object): lines.append('') lines.append('URL: {}'.format(link)) for enclosure in getattr(entry, 'enclosures', []): - if getattr(enclosure, 'url', None): - lines.append('Enclosure: {}'.format(enclosure.url)) - if getattr(enclosure, 'src', None): - lines.append('Enclosure: {}'.format(enclosure.src)) + for url in [ + getattr(enclosure, 'url', None), + getattr(enclosure, 'src', None), + ]: + ref_link,part = self._get_reference(url=url) + lines.append('Enclosure: {}'.format(ref_link)) + if part: + parts.append(part) for elink in getattr(entry, 'links', []): if elink.get('rel', None) == 'via': url = elink['href'] + ref_link,part = self._get_reference(url=url) title = elink.get('title', url) - lines.append('Via: {} {}'.format(title, url)) + lines.append('Via: {} {}'.format(title, ref_link)) + if part: + parts.append(part) content['type'] = 'text/plain' content['value'] = '\n'.join(lines) - return content + content_part = _email.get_mimetext( + body=content['value'], + content_type=content['type'].split('/', 1)[1], + config=self.config, + section=self.section) + parts.insert(0, content_part) + return parts + + def _process_entry_content_html(self, entry, content, subject, html): + """Manipulate the entry HTML + + For example, replace links to images with cid: links if + `self.include_references` is True. + + Returns the new HTML and a list of parts (which may be empty). + """ + return (html, []) # TODO + + def _get_reference(self, url): + """Get references for cid: links. + + RFC 2392 [1] provides linking between message parts based on + Content-IDs via `cid:...` URLs. 
If `self.include_references` + is True, download the object referenced by `url` and return a + tuple containing a `cid:...` URL and the MIME part containing + the referenced data. Otherwise, return `(url, None)`. + + [1]: http://tools.ietf.org/html/rfc2392 + """ + if self.include_references: + cid = _email.get_id() + link = 'cid:{}'.format(_urllib_parse.quote(cid[1:-1])) + _LOG.critical(link) + part = _MIMENoneMultipart('image', 'png') + part.add_content(b'AAA') + else: + link = url + part = None + _LOG.critical((link, part)) + return (link, part) def _send(self, sender, message): _LOG.info('send message for {}'.format(self)) diff --git a/test/xkcd/1.config b/test/xkcd/1.config new file mode 100644 index 0000000..15935c4 --- /dev/null +++ b/test/xkcd/1.config @@ -0,0 +1,3 @@ +[DEFAULT] +to = a@b.com +date-header = True diff --git a/test/xkcd/1.expected b/test/xkcd/1.expected new file mode 100644 index 0000000..c815599 --- /dev/null +++ b/test/xkcd/1.expected @@ -0,0 +1,87 @@ +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Flowchart +Date: Fri, 05 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1195/ +X-RSS-URL: http://xkcd.com/1195/ + +![The way out is to use the marker you have to add a box that says 'get a +marker' to the line between you and 'start', then add a 'no' line from the +trap box to 'end'.](http://imgs.xkcd.com/comics/flowchart.png) + + + +URL: http://xkcd.com/1195/ + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Subways +Date: Mon, 08 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1196/ +X-RSS-URL: http://xkcd.com/1196/ + +![About one in three North 
American subway stops are in +NYC.](http://imgs.xkcd.com/comics/subways.png) + + + +URL: http://xkcd.com/1196/ + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: All Adobe Updates +Date: Wed, 10 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1197/ +X-RSS-URL: http://xkcd.com/1197/ + +![ALERT: Some pending mandatory software updates require version 21.1.2 of the +Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, +which is not available for your +platform.](http://imgs.xkcd.com/comics/all_adobe_updates.png) + + + +URL: http://xkcd.com/1197/ + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Geologist +Date: Fri, 12 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1198/ +X-RSS-URL: http://xkcd.com/1198/ + +!['It seems like it's still alive, Professor.' 
'Yeah, a big one like this can +keep running around for a few billion years after you remove the +head."](http://imgs.xkcd.com/comics/geologist.png) + + + +URL: http://xkcd.com/1198/ diff --git a/test/xkcd/2.config b/test/xkcd/2.config new file mode 100644 index 0000000..a9794f6 --- /dev/null +++ b/test/xkcd/2.config @@ -0,0 +1,4 @@ +[DEFAULT] +to = a@b.com +date-header = True +html-mail = True diff --git a/test/xkcd/2.expected b/test/xkcd/2.expected new file mode 100644 index 0000000..7b7a2fd --- /dev/null +++ b/test/xkcd/2.expected @@ -0,0 +1,123 @@ +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Flowchart +Date: Fri, 05 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1195/ +X-RSS-URL: http://xkcd.com/1195/ + + + + + + +

Flowchart +

The way out is to use the marker you have to add a box that says 'get a marker' to the line between you and 'start', then add a 'no' line from the trap box to 'end'.

+ +

+ + + + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Subways +Date: Mon, 08 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1196/ +X-RSS-URL: http://xkcd.com/1196/ + + + + + + +

Subways +

+ +

+ + + + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: All Adobe Updates +Date: Wed, 10 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1197/ +X-RSS-URL: http://xkcd.com/1197/ + + + + + + +

All Adobe Updates +

ALERT: Some pending mandatory software updates require version 21.1.2 of the Oracle/Sun Java(tm) JDK(tm) Update Manager Runtime Environment Meta-Updater, which is not available for your platform.

+ +

+ + + + +SENT BY: "xkcd.com" +MIME-Version: 1.0 +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: 7bit +From: "xkcd.com" +To: a@b.com +Subject: Geologist +Date: Fri, 12 Apr 2013 00:00:00 -0000 +Message-ID: <...@dev.null.invalid> +User-Agent: rss2email +X-RSS-Feed: xkcd/feed.atom +X-RSS-ID: http://xkcd.com/1198/ +X-RSS-URL: http://xkcd.com/1198/ + + + + + + +

Geologist +

'It seems like it's still alive, Professor.' 'Yeah, a big one like this can keep running around for a few billion years after you remove the head."

+ +

+ + + diff --git a/test/xkcd/3.config b/test/xkcd/3.config new file mode 100644 index 0000000..c9da8c1 --- /dev/null +++ b/test/xkcd/3.config @@ -0,0 +1,4 @@ +[DEFAULT] +to = a@b.com +date-header = True +include-references = True diff --git a/test/xkcd/4.config b/test/xkcd/4.config new file mode 100644 index 0000000..d8e0f9d --- /dev/null +++ b/test/xkcd/4.config @@ -0,0 +1,5 @@ +[DEFAULT] +to = a@b.com +date-header = True +include-references = True +html-mail = True diff --git a/test/xkcd/README b/test/xkcd/README new file mode 100644 index 0000000..d6fb8b0 --- /dev/null +++ b/test/xkcd/README @@ -0,0 +1,22 @@ +feed.atom is a snapshot of + + http://xkcd.com/atom.xml + +as of 2013-04-14. + +HTTP headers: + + HTTP/1.1 200 OK + Vary: Accept-Encoding + Content-Type: application/xml + Accept-Ranges: bytes + ETag: "3353825625" + Last-Modified: Fri, 12 Apr 2013 04:00:05 GMT + Content-Length: 2498 + Date: Sun, 14 Apr 2013 12:49:47 GMT + Server: lighttpd/1.4.28 + +The URLs referenced from the entries were downloaded on the same day +and stored with filenames matching their URL +(e.g. `http://imgs.xkcd.com/comics/geologist.png` is stored as +`imgs.xkcd.com/comics/geologist.png`). diff --git a/test/xkcd/feed.atom b/test/xkcd/feed.atom new file mode 100644 index 0000000..466653a --- /dev/null +++ b/test/xkcd/feed.atom @@ -0,0 +1,2 @@ + +xkcd.comhttp://xkcd.com/2013-04-12T00:00:00ZGeologist2013-04-12T00:00:00Zhttp://xkcd.com/1198/

All Adobe Updates2013-04-10T00:00:00Zhttp://xkcd.com/1197/

Subways2013-04-08T00:00:00Zhttp://xkcd.com/1196/

Flowchart2013-04-05T00:00:00Zhttp://xkcd.com/1195/

\ No newline at end of file diff --git a/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png b/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png new file mode 100644 index 0000000..e033c6f Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/all_adobe_updates.png differ diff --git a/test/xkcd/imgs.xkcd.com/comics/flowchart.png b/test/xkcd/imgs.xkcd.com/comics/flowchart.png new file mode 100644 index 0000000..fbd0123 Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/flowchart.png differ diff --git a/test/xkcd/imgs.xkcd.com/comics/geologist.png b/test/xkcd/imgs.xkcd.com/comics/geologist.png new file mode 100644 index 0000000..c8c81a8 Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/geologist.png differ diff --git a/test/xkcd/imgs.xkcd.com/comics/subways.png b/test/xkcd/imgs.xkcd.com/comics/subways.png new file mode 100644 index 0000000..6093927 Binary files /dev/null and b/test/xkcd/imgs.xkcd.com/comics/subways.png differ