1 # -*- coding: utf-8 -*-
2 # Copyright (C) 2004-2013 Aaron Swartz
5 # Dennis Keitzel <github@pinshot.net>
7 # Etienne Millon <me@emillon.org>
8 # J. Lewis Muir <jlmuir@imca-cat.org>
10 # Lindsey Smith <lindsey.smith@gmail.com>
12 # Martin 'Joey' Schulze
14 # W. Trevor King <wking@tremily.us>
16 # This file is part of rss2email.
18 # rss2email is free software: you can redistribute it and/or modify it under
19 # the terms of the GNU General Public License as published by the Free Software
20 # Foundation, either version 2 of the License, or (at your option) version 3 of
23 # rss2email is distributed in the hope that it will be useful, but WITHOUT ANY
24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
25 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
27 # You should have received a copy of the GNU General Public License along with
28 # rss2email. If not, see <http://www.gnu.org/licenses/>.
30 """Define the ``Feed`` class for handling a single feed
33 import collections as _collections
34 from email.mime.message import MIMEMessage as _MIMEMessage
35 from email.mime.multipart import MIMEMultipart as _MIMEMultipart
36 from email.utils import formataddr as _formataddr
37 import hashlib as _hashlib
38 import html.parser as _html_parser
40 import socket as _socket
42 import urllib.error as _urllib_error
43 import urllib.request as _urllib_request
45 import xml.sax as _sax
46 import xml.sax.saxutils as _saxutils
48 import feedparser as _feedparser
49 import html2text as _html2text
52 from . import __version__
53 from . import LOG as _LOG
54 from . import config as _config
55 from . import email as _email
56 from . import error as _error
57 from . import util as _util
# Module-level setup: user-agent string, urllib opener, and the tuple of
# socket exception types used by _check_for_errors.
# NOTE(review): this dump is missing original line 63 (presumably
# `_SOCKET_ERRORS = []`), without which the loop below raises NameError.
# Also `__url__`, `_re`, `_time`, and `_uuid` are used later but their
# import lines are absent from this dump -- recover from upstream.
60 _USER_AGENT = 'rss2email/{} +{}'.format(__version__, __url__)
61 _feedparser.USER_AGENT = _USER_AGENT
62 _urllib_request.install_opener(_urllib_request.build_opener())
64 for e in ['error', 'herror', 'gaierror']:
65 if hasattr(_socket, e):
66 _SOCKET_ERRORS.append(getattr(_socket, e))
67 del e # cleanup namespace
68 _SOCKET_ERRORS = tuple(_SOCKET_ERRORS)
71 # TypeError: 'str' does not support the buffer interface
72 _feedparser.PREFERRED_XML_PARSERS = []
76 """Utility class for feed manipulation and storage.
80 >>> from .config import CONFIG
83 ... name='test-feed', url='http://example.com/feed.atom', to='a@b.com')
85 test-feed (http://example.com/feed.atom -> a@b.com)
89 'user@rss2email.invalid'
91 >>> feed.from_email = 'a@b.com'
92 >>> feed.save_to_config()
93 >>> feed.config.write(sys.stdout) # doctest: +REPORT_UDIFF, +ELLIPSIS
95 from = user@rss2email.invalid
100 url = http://example.com/feed.atom
105 >>> feed.etag = 'dummy etag'
106 >>> string = pickle.dumps(feed)
107 >>> feed = pickle.loads(string)
108 >>> feed.load_from_config(config=CONFIG)
112 'http://example.com/feed.atom'
114 Names can only contain letters, digits, and '._-'. Here the
115 invalid space causes an exception:
117 >>> Feed(name='invalid name')
118 Traceback (most recent call last):
120 rss2email.error.InvalidFeedName: invalid feed name 'invalid name'
122 However, you aren't restricted to ASCII letters:
124 >>> Feed(name='Αθήνα')
125 <Feed Αθήνα (None -> )>
127 You must define a URL:
129 >>> Feed(name='feed-without-a-url', to='a@b.com').run(send=False)
130 Traceback (most recent call last):
132 rss2email.error.InvalidFeedConfig: invalid feed configuration {'url': None}
137 >>> CONFIG['DEFAULT']['to'] = ''
138 >>> test_section = CONFIG.pop('feed.test-feed')
# Class-level attribute tables driving (de)serialization of Feed state:
# dynamic state goes to feed.dat via __getstate__/__setstate__, configured
# settings to the ConfigParser, with translation maps between attribute
# names and .config option names plus type-hint lists for conversion.
# NOTE(review): the dump omits the bodies of _dynamic_attributes,
# _non_default_configured_attributes, _boolean_attributes,
# _integer_attributes, _list_attributes, and parts of
# _function_attributes (original lines 145-150, 154-155, 177-191,
# 193-201, 203, 205) -- recover from upstream before editing.
141 _name_regexp = _re.compile('^[\w\d.-]+$')
143 # saved/loaded from feed.dat using __getstate__/__setstate__.
144 _dynamic_attributes = [
151 ## saved/loaded from ConfigParser instance
152 # attributes that aren't in DEFAULT
153 _non_default_configured_attributes = [
156 # attributes that are in DEFAULT
157 _default_configured_attributes = [
158 key.replace('-', '_') for key in _config.CONFIG['DEFAULT'].keys()]
159 _default_configured_attributes[
160 _default_configured_attributes.index('from')
161 ] = 'from_email' # `from` is a Python keyword
162 # all attributes that are saved/loaded from .config
163 _configured_attributes = (
164 _non_default_configured_attributes + _default_configured_attributes)
165 # attribute name -> .config option
166 _configured_attribute_translations = dict(
167 (attr,attr) for attr in _non_default_configured_attributes)
168 _configured_attribute_translations.update(dict(
169 zip(_default_configured_attributes,
170 _config.CONFIG['DEFAULT'].keys())))
171 # .config option -> attribute name
172 _configured_attribute_inverse_translations = dict(
173 (v,k) for k,v in _configured_attribute_translations.items())
175 # hints for value conversion
176 _boolean_attributes = [
179 'use_publisher_email',
187 'links_after_each_paragraph',
192 _integer_attributes = [
202 _function_attributes = [
204 'digest_post_process',
207 def __init__(self, name=None, url=None, to=None, config=None):
# Initialize a feed: validate the name, seed dynamic attributes via
# __setstate__ (round-tripping class defaults), then overlay configured
# values from `config`.
# NOTE(review): original line 209 is missing from this dump (presumably
# the `url`/`to` overrides) -- recover from upstream before editing.
208 self._set_name(name=name)
210 self.__setstate__(dict(
211 (attr, getattr(self, attr))
212 for attr in self._dynamic_attributes))
213 self.load_from_config(config=config)
# NOTE(review): the `def __str__(self):` and `def __repr__(self):` header
# lines (original 219 and 222) are missing from this dump; only the two
# return bodies remain below.
220 return '{} ({} -> {})'.format(self.name, self.url, self.to)
223 return '<Feed {}>'.format(str(self))
225 def __getstate__(self):
# Snapshot the per-run state (name, etag, modified, seen, ...) as a plain
# dict for pickling to feed.dat.
# NOTE(review): original line 227 (likely the `return dict(` opener) is
# missing from this dump, truncating the expression below.
226 "Save dynamic attributes"
228 (key,getattr(self,key)) for key in self._dynamic_attributes)
230 get_state = __getstate__ # make it publicly accessible
def __setstate__(self, state):
    """Restore dynamic (per-run) attributes from *state*.

    *state* must contain exactly the keys listed in
    ``self._dynamic_attributes``; any mismatch raises ``ValueError``.
    The name is routed through ``_set_name`` first so it is validated
    and ``self.section`` refreshed before the bulk update.
    """
    # Compare sorted key lists so dict ordering differences don't matter.
    if sorted(state.keys()) != sorted(self._dynamic_attributes):
        raise ValueError(state)
    self._set_name(name=state['name'])
    self.__dict__.update(state)

set_state = __setstate__  # make it publicly accessible
242 def save_to_config(self):
243 "Save configured attributes"
# Serialize per-feed settings into self.config[self.section], keeping
# only values that differ from DEFAULT (non-default attributes always).
# NOTE(review): original line 254 (presumably `data[key] = value`) is
# missing from this dump, so as listed the loop never fills `data`.
244 data = _collections.OrderedDict()
245 default = self.config['DEFAULT']
246 for attr in self._configured_attributes:
247 key = self._configured_attribute_translations[attr]
248 value = getattr(self, attr)
249 if value is not None:
250 value = self._get_configured_option_value(
251 attribute=attr, value=value)
252 if (attr in self._non_default_configured_attributes or
253 value != default[key]):
255 self.config[self.section] = data
257 def load_from_config(self, config=None):
258 "Restore configured attributes"
# Read this feed's section (falling back to DEFAULT), validate that the
# option keys match the expected set, convert each value to its Python
# type via _get_configured_attribute_value, and apply onto the instance.
# NOTE(review): original lines 259, 261, 264, 268-269, 275, 280, 284,
# 287-288 are missing from this dump -- both validation `for` headers,
# the `dict(` opener, and the non-default fallback assignments are
# truncated below.
260 config = _config.CONFIG
262 if self.section in self.config:
263 data = self.config[self.section]
265 data = self.config['DEFAULT']
266 keys = sorted(data.keys())
267 expected = sorted(self._configured_attribute_translations.values())
270 if (key not in keys and
271 key not in self._non_default_configured_attributes):
272 raise _error.InvalidFeedConfig(
273 setting=key, feed=self,
274 message='missing configuration key: {}'.format(key))
276 if key not in expected:
277 raise _error.InvalidFeedConfig(
278 setting=key, feed=self,
279 message='extra configuration key: {}'.format(key))
281 (self._configured_attribute_inverse_translations[k],
282 self._get_configured_attribute_value(
283 attribute=self._configured_attribute_inverse_translations[k],
285 for k in data.keys())
286 for attr in self._non_default_configured_attributes:
289 self.__dict__.update(data)
291 def _get_configured_option_value(self, attribute, value):
# Convert a Python attribute value into its .config string form, using
# the per-kind hint lists (lists joined with ', ', functions by name).
# NOTE(review): original lines 292-293 (likely the boolean branch) and
# 298 (the fallthrough return) are missing from this dump, which is why
# the chain below starts with a bare `elif`.
294 elif attribute in self._list_attributes:
295 return ', '.join(value)
296 elif attribute in self._function_attributes:
297 return _util.import_name(value)
300 def _get_configured_attribute_value(self, attribute, key, data):
# Inverse of _get_configured_option_value: parse a .config string into
# the attribute's Python type using the boolean/integer/list/function
# hint lists.
# NOTE(review): original lines 308 and 310-313 are missing from this
# dump (presumably the try/except around import_function and the plain
# `return data[key]` default).
301 if attribute in self._boolean_attributes:
302 return data.getboolean(key)
303 elif attribute in self._integer_attributes:
304 return data.getint(key)
305 elif attribute in self._list_attributes:
306 return [x.strip() for x in data[key].split(',')]
307 elif attribute in self._function_attributes:
309 return _util.import_function(data[key])
314 """Reset dynamic data
320 def _set_name(self, name):
# Validate the feed name against _name_regexp (letters, digits, '._-')
# and derive the config section name 'feed.<name>'.
# NOTE(review): original line 323 (presumably `self.name = name`) is
# missing from this dump; line 324 depends on it.
321 if not self._name_regexp.match(name):
322 raise _error.InvalidFeedName(name=name, feed=self)
324 self.section = 'feed.{}'.format(self.name)
# NOTE(review): the `def _fetch(self):` header (original ~line 326) is
# missing from this dump; the docstring and body below belong to it.
# Fetches the feed with feedparser under a per-feed timeout, passing
# etag/modified for conditional GETs and an optional HTTP proxy handler.
327 """Fetch and parse a feed using feedparser.
330 ... name='test-feed',
331 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
332 >>> parsed = feed._fetch()
336 _LOG.info('fetch {}'.format(self))
338 raise _error.InvalidFeedConfig(setting='url', feed=self)
339 if self.section in self.config:
340 config = self.config[self.section]
342 config = self.config['DEFAULT']
343 proxy = config['proxy']
344 timeout = config.getint('feed-timeout')
347 kwargs['handlers'] = [_urllib_request.ProxyHandler({'http':proxy})]
348 f = _util.TimeLimitedFunction(timeout, _feedparser.parse)
349 return f(self.url, self.etag, modified=self.modified, **kwargs)
351 def _process(self, parsed):
# Generator: walk entries oldest-first, convert each into an email, run
# the optional post_process hook, and yield (guid, id_, sender, message).
# NOTE(review): original lines 357, 362-364, and 366 are missing from
# this dump (presumably the `if processed:` guard and the
# `if not message: continue` handling after post_process).
352 _LOG.info('process {}'.format(self))
353 self._check_for_errors(parsed)
354 for entry in reversed(parsed.entries):
355 _LOG.debug('processing {}'.format(entry.get('id', 'no-id')))
356 processed = self._process_entry(parsed=parsed, entry=entry)
358 guid,id_,sender,message = processed
359 if self.post_process:
360 message = self.post_process(
361 feed=self, parsed=parsed, entry=entry, guid=guid,
365 yield (guid, id_, sender, message)
367 def _check_for_errors(self, parsed):
# Inspect the feedparser result: follow permanent redirects, raise on
# unexpected HTTP status, log suspicious headers/versions, and classify
# bozo exceptions (timeouts, socket errors, compression, SAX, ...).
# NOTE(review): many original lines are absent from this dump (368, 371,
# 377, 379, 381, 383-384, 387, 390-391, 393, 395, 397-398, 402, 405,
# 408, 411, 414, 416, 419, 422, 424, 426, 429-431, 434) -- the `if`
# headers, `raise` statements, and `return`s between the branches below
# must be recovered from upstream; treat this as a partial listing.
369 status = getattr(parsed, 'status', 200)
370 _LOG.debug('HTTP status {}'.format(status))
372 _LOG.info('redirect {} from {} to {}'.format(
373 self.name, self.url, parsed['url']))
374 self.url = parsed['url']
375 elif status not in [200, 302, 304]:
376 raise _error.HTTPError(status=status, feed=self)
378 http_headers = parsed.get('headers', {})
380 _LOG.debug('HTTP headers: {}'.format(http_headers))
382 _LOG.warning('could not get HTTP headers: {}'.format(self))
385 if 'html' in http_headers.get('content-type', 'rss'):
386 _LOG.warning('looks like HTML: {}'.format(self))
388 if http_headers.get('content-length', '1') == '0':
389 _LOG.warning('empty page: {}'.format(self))
392 version = parsed.get('version', None)
394 _LOG.debug('feed version {}'.format(version))
396 _LOG.warning('unrecognized version: {}'.format(self))
399 exc = parsed.get('bozo_exception', None)
400 if isinstance(exc, _socket.timeout):
401 _LOG.error('timed out: {}'.format(self))
403 elif isinstance(exc, OSError):
404 _LOG.error('{}: {}'.format(exc, self))
406 elif isinstance(exc, _SOCKET_ERRORS):
407 _LOG.error('{}: {}'.format(exc, self))
409 elif isinstance(exc, _feedparser.zlib.error):
410 _LOG.error('broken compression: {}'.format(self))
412 elif isinstance(exc, (IOError, AttributeError)):
413 _LOG.error('{}: {}'.format(exc, self))
415 elif isinstance(exc, KeyboardInterrupt):
417 elif isinstance(exc, _sax.SAXParseException):
418 _LOG.error('sax parsing error: {}: {}'.format(exc, self))
420 elif (parsed.bozo and
421 isinstance(exc, _feedparser.CharacterEncodingOverride)):
423 'incorrectly declared encoding: {}: {}'.format(exc, self))
425 elif parsed.bozo or exc:
427 exc = "can't process"
428 _LOG.error('processing error: {}: {}'.format(exc, self))
# NOTE(review): `feed=feed` below looks like a NameError -- no local
# `feed` is visible in this method; probably should be `feed=self`.
# Confirm against upstream before changing.
432 status in [200, 302] and
433 not parsed.entries and
435 raise _error.ProcessingError(parsed=parsed, feed=feed)
437 def _html2text(self, html, baseurl='', default=None):
# Convert HTML to text via html2text, configured for this feed's
# section; on parse failure fall back to `default` when supplied.
# NOTE(review): original lines 439 and 443-445 are missing from this
# dump (the `try:`, the `return default`, and the re-raise).  Also
# html.parser.HTMLParseError was removed in Python 3.5, so this
# `except` clause raises AttributeError on modern Pythons -- confirm
# the upstream fix before relying on this path.
438 self.config.setup_html2text(section=self.section)
440 return _html2text.html2text(html=html, baseurl=baseurl)
441 except _html_parser.HTMLParseError as e:
442 if default is not None:
446 def _process_entry(self, parsed, entry):
# Convert one feedparser entry into (guid, id_, sender, email message),
# or return None when the entry has already been seen.
# NOTE(review): numerous original lines are missing from this dump
# (465, 468, 470-471, 474, 477-478, 480-481, 483, 487-488, 490-492,
# 496) -- header filtering, bonus-header error handling, the
# ProcessingError recovery, and most _email.get_message keyword
# arguments are truncated; treat as a partial listing.
447 id_ = self._get_entry_id(entry)
448 # If .trust_guid isn't set, we get back hashes of the content.
449 # Instead of letting these run wild, we put them in context
450 # by associating them with the actual ID (if it exists).
451 guid = entry.get('id', id_)
452 if isinstance(guid, dict):
# NOTE(review): dict.values() is a view in Python 3 and does not
# support indexing -- this line would raise TypeError at runtime;
# likely needs list(guid.values())[0].  Confirm against upstream.
453 guid = guid.values()[0]
454 if guid in self.seen:
455 if self.seen[guid]['id'] == id_:
456 _LOG.debug('already seen {}'.format(id_))
457 return # already seen
458 sender = self._get_entry_email(parsed=parsed, entry=entry)
459 subject = self._get_entry_title(entry)
460 extra_headers = _collections.OrderedDict((
461 ('Date', self._get_entry_date(entry)),
462 ('Message-ID', '<{}@dev.null.invalid>'.format(_uuid.uuid4())),
463 ('User-Agent', _USER_AGENT),
464 ('X-RSS-Feed', self.url),
466 ('X-RSS-URL', self._get_entry_link(entry)),
467 ('X-RSS-TAGS', self._get_entry_tags(entry)),
469 for k,v in extra_headers.items(): # remove empty tags, etc.
472 if self.bonus_header:
473 for header in self.bonus_header.splitlines():
475 key,value = header.split(':', 1)
476 extra_headers[key.strip()] = value.strip()
479 'malformed bonus-header: {}'.format(
482 content = self._get_entry_content(entry)
484 content = self._process_entry_content(
485 entry=entry, content=content, subject=subject)
486 except _error.ProcessingError as e:
489 message = _email.get_message(
493 body=content['value'],
494 content_type=content['type'].split('/', 1)[1],
495 extra_headers=extra_headers,
497 section=self.section)
498 return (guid, id_, sender, message)
500 def _get_entry_id(self, entry):
501 """Get best ID from an entry."""
# Prefers the feed-supplied id (when trusted), otherwise falls back to a
# SHA1 hash of the content, link, or title.
# NOTE(review): original lines 502, 504, 509, 512, and 521 are missing
# from this dump (the trust_guid test, `return entry.id`, and the final
# fallback); line 503's `return` therefore appears headless below.
503 return entry.get('link', None)
505 if getattr(entry, 'id', None):
506 # Newer versions of feedparser could return a dictionary
507 if isinstance(entry.id, dict):
# NOTE(review): .values()[0] fails on Python 3 dict views -- likely
# needs list(entry.id.values())[0]; confirm against upstream.
508 return entry.id.values()[0]
510 content = self._get_entry_content(entry)
511 content_value = content['value'].strip()
513 return _hashlib.sha1(
514 content_value.encode('unicode-escape')).hexdigest()
515 elif getattr(entry, 'link', None):
516 return _hashlib.sha1(
517 entry.link.encode('unicode-escape')).hexdigest()
518 elif getattr(entry, 'title', None):
519 return _hashlib.sha1(
520 entry.title.encode('unicode-escape')).hexdigest()
522 def _get_entry_link(self, entry):
523 return entry.get('link', None)
525 def _get_entry_title(self, entry):
# Best-effort subject line: prefer title_detail (HTML converted to
# text), otherwise derive a title from the entry content; newlines are
# collapsed to spaces.
# NOTE(review): original lines 530, 535, and 537-538 are missing from
# this dump (presumably the `else:` branch header, the truncation of
# the derived title, and the final `return title`).
526 if hasattr(entry, 'title_detail') and entry.title_detail:
527 title = entry.title_detail.value
528 if 'html' in entry.title_detail.type:
529 title = self._html2text(title, default=title)
531 content = self._get_entry_content(entry)
532 value = content['value']
533 if content['type'] in ('text/html', 'application/xhtml+xml'):
534 value = self._html2text(value, default=value)
536 title = title.replace('\n', ' ').strip()
539 def _get_entry_date(self, entry):
# Pick the first available parsed date per self.date_header_order,
# defaulting to the current UTC time, formatted RFC 2822-style with a
# literal -0000 offset (UTC, unknown local zone).
# NOTE(review): original lines 541 and 546 are missing from this dump
# (presumably an `if self.date_header:` guard and a `break` after the
# first match).
540 datetime = _time.gmtime()
542 for datetype in self.date_header_order:
543 kind = datetype + '_parsed'
544 if entry.get(kind, None):
545 datetime = entry[kind]
547 return _time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
549 def _get_entry_name(self, parsed, entry):
# Build the sender display name by formatting self.name_format with
# feed-title/author/publisher data gathered from the entry and feed.
# NOTE(review): many original lines are missing from this dump
# (550-551, 556-557, 561-564, 568, 571, 574, 578, 580-582, 586-587,
# 593), including the docstring opener/terminator and -- critically --
# the `feed = parsed.feed` binding that lines 588-595 depend on.
552 >>> import feedparser
553 >>> f = Feed(name='test-feed')
554 >>> parsed = feedparser.parse(
555 ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
558 ... ' <name>Example author</name>\\n'
559 ... ' <email>me@example.com</email>\\n'
560 ... ' <url>http://example.com/</url>\\n'
565 >>> entry = parsed.entries[0]
566 >>> f.name_format = ''
567 >>> f._get_entry_name(parsed, entry)
569 >>> f.name_format = '{author}'
570 >>> f._get_entry_name(parsed, entry)
572 >>> f.name_format = '{feed-title}: {author}'
573 >>> f._get_entry_name(parsed, entry)
575 >>> f.name_format = '{author} ({feed.name})'
576 >>> f._get_entry_name(parsed, entry)
577 'Example author (test-feed)'
579 if not self.name_format:
583 'feed-title': '<feed title>',
584 'author': '<author>',
585 'publisher': '<publisher>',
588 data['feed-title'] = feed.get('title', '')
589 for x in [entry, feed]:
590 if 'name' in x.get('author_detail', []):
591 if x.author_detail.name:
592 data['author'] = x.author_detail.name
594 if 'name' in feed.get('publisher_detail', []):
595 data['publisher'] = feed.publisher_detail.name
596 name = self.name_format.format(**data)
597 return _html2text.unescape(name)
599 def _validate_email(self, email, default=None):
# NOTE(review): original lines 601, 604-605, 608, 615, 618, and 620-622
# are missing from this dump -- including the docstring terminator, the
# `if default is None:` guard, `return default`, and the final
# `return email` for well-formed addresses.  As listed, the body is
# truncated after the self.from_email fallback.
600 """Do a basic quality check on email address
602 Return `default` if the address doesn't appear to be
603 well-formed. If `default` is `None`, return
606 >>> f = Feed(name='test-feed')
607 >>> f._validate_email('valid@example.com', 'default@example.com')
609 >>> f._validate_email('invalid@', 'default@example.com')
610 'default@example.com'
611 >>> f._validate_email('@invalid', 'default@example.com')
612 'default@example.com'
613 >>> f._validate_email('invalid', 'default@example.com')
614 'default@example.com'
616 parts = email.split('@')
617 if len(parts) != 2 or '' in parts:
619 return self.from_email
623 def _get_entry_address(self, parsed, entry):
624 """Get the best From email address ('<jdoe@a.com>')
626 If the best guess isn't well-formed (something@somthing.com),
627 use `self.from_email` instead.
# NOTE(review): original lines 625, 628-629, and 631 are missing from
# this dump -- including the docstring terminator, the guard that
# precedes line 630's early return (presumably `if self.force_from:`),
# and the `feed = parsed.feed` binding that lines 634-640 depend on.
630 return self.from_email
632 if 'email' in entry.get('author_detail', []):
633 return self._validate_email(entry.author_detail.email)
634 elif 'email' in feed.get('author_detail', []):
635 return self._validate_email(feed.author_detail.email)
636 if self.use_publisher_email:
637 if 'email' in feed.get('publisher_detail', []):
638 return self._validate_email(feed.publisher_detail.email)
639 if feed.get('errorreportsto', None):
640 return self._validate_email(feed.errorreportsto)
641 _LOG.debug('no sender address found, fallback to default')
642 return self.from_email
def _get_entry_email(self, parsed, entry):
    """Get the best From email address ('John <jdoe@a.com>')

    Combines the display name from ``_get_entry_name`` with the
    address from ``_get_entry_address`` into an RFC 5322 mailbox
    string via ``email.utils.formataddr``.
    """
    name = self._get_entry_name(parsed=parsed, entry=entry)
    address = self._get_entry_address(parsed=parsed, entry=entry)
    return _formataddr((name, address))
651 def _get_entry_tags(self, entry):
652 """Add post tags, if available
654 >>> f = Feed(name='test-feed')
655 >>> f._get_entry_tags({
656 ... 'tags': [{'term': 'tag1',
658 ... 'label': None}]})
660 >>> f._get_entry_tags({
661 ... 'tags': [{'term': 'tag1',
666 ... 'label': None}]})
669 Test some troublesome cases. No tags:
671 >>> f._get_entry_tags({})
675 >>> f._get_entry_tags({'tags': []})
677 Tags without a ``term`` entry:
679 >>> f._get_entry_tags({
680 ... 'tags': [{'scheme': None,
681 ... 'label': None}]})
683 Tags with an empty term:
685 >>> f._get_entry_tags({
686 ... 'tags': [{'term': '',
688 ... 'label': None}]})
690 taglist = [tag['term'] for tag in entry.get('tags', [])
691 if tag.get('term', '')]
693 return ','.join(taglist)
695 def _get_entry_content(self, entry):
696 """Select the best content from an entry.
698 Returns a feedparser content dict.
# NOTE(review): original lines 697, 699-700, 706, 713-714, 716, and
# 721-723 are missing from this dump -- including the docstring
# terminator, the `if self.html_mail:`/`else:` branch headers that
# select between the two `types` orderings, and the `return content`
# for a matching type.  Only the empty-content fallback return remains.
701 # * We have a bunch of potential contents.
702 # * We go thru looking for our first choice.
703 # (HTML or text, depending on self.html_mail)
704 # * If that doesn't work, we go thru looking for our second choice.
705 # * If that still doesn't work, we just take the first one.
707 # Possible future improvement:
708 # * Instead of just taking the first one
709 # pick the one in the "best" language.
710 # * HACK: hardcoded .html_mail, should take a tuple of media types
711 contents = list(entry.get('content', []))
712 if entry.get('summary_detail', None):
713 contents.append(entry.summary_detail)
715 types = ['text/html', 'text/plain']
717 types = ['text/plain', 'text/html']
718 for content_type in types:
719 for content in contents:
720 if content['type'] == content_type:
724 return {'type': 'text/plain', 'value': ''}
726 def _process_entry_content(self, entry, content, subject):
727 "Convert entry content to the requested format."
# Produce either a full HTML document (CSS, linked header, body,
# enclosure and 'via' footer) or a plain-text rendering, depending on
# self.html_mail, mutating and returning the `content` dict.
# NOTE(review): many original lines are missing from this dump
# (729-734, 736, 738-744, 746-748, 751, 754, 757, 760, 762, 764,
# 766-767, 771, 774-780, 783, 786, 790, 792, 801, 806-807) -- the
# html_mail branch header, the HTML boilerplate list, several
# lines.append openers, `url = elink['href']` bindings, and the final
# `return content` are truncated; treat as a partial listing.  Also
# html.parser.HTMLParseError (line 788) was removed in Python 3.5 --
# confirm the upstream fix.
728 link = self._get_entry_link(entry)
735 if self.use_css and self.css:
737 ' <style type="text/css">',
745 '<h1 class="header"><a href="{}">{}</a></h1>'.format(
749 if content['type'] in ('text/html', 'application/xhtml+xml'):
750 lines.append(content['value'].strip())
752 lines.append(_saxutils.escape(content['value'].strip()))
753 lines.append('</div>')
755 '<div class="footer">'
756 '<p>URL: <a href="{0}">{0}</a></p>'.format(link),
758 for enclosure in getattr(entry, 'enclosures', []):
759 if getattr(enclosure, 'url', None):
761 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
763 if getattr(enclosure, 'src', None):
765 '<p>Enclosure: <a href="{0}">{0}</a></p>'.format(
768 '<p><img src="{}" /></p>'.format(enclosure.src))
769 for elink in getattr(entry, 'links', []):
770 if elink.get('rel', None) == 'via':
772 title = elink.get('title', url)
773 lines.append('<p>Via <a href="{}">{}</a></p>'.format(
781 content['type'] = 'text/html'
782 content['value'] = '\n'.join(lines)
784 else: # not self.html_mail
785 if content['type'] in ('text/html', 'application/xhtml+xml'):
787 lines = [self._html2text(content['value'])]
788 except _html_parser.HTMLParseError as e:
789 raise _error.ProcessingError(parsed=None, feed=self)
791 lines = [content['value']]
793 lines.append('URL: {}'.format(link))
794 for enclosure in getattr(entry, 'enclosures', []):
795 if getattr(enclosure, 'url', None):
796 lines.append('Enclosure: {}'.format(enclosure.url))
797 if getattr(enclosure, 'src', None):
798 lines.append('Enclosure: {}'.format(enclosure.src))
799 for elink in getattr(entry, 'links', []):
800 if elink.get('rel', None) == 'via':
802 title = elink.get('title', url)
803 lines.append('Via: {} {}'.format(title, url))
804 content['type'] = 'text/plain'
805 content['value'] = '\n'.join(lines)
808 def _send(self, sender, message):
# Deliver one message via the email backend, using the feed's own
# config section when present.
# NOTE(review): original line 812 (presumably `section = 'DEFAULT'`)
# is missing from this dump, leaving the `if` on line 811 bodiless.
809 _LOG.info('send message for {}'.format(self))
810 section = self.section
811 if section not in self.config:
813 _email.send(sender=sender, recipient=self.to, message=message,
814 config=self.config, section=section)
816 def run(self, send=True):
817 """Fetch and process the feed, mailing entry emails.
# Top-level driver: fetch, iterate new entries, send (or collect into a
# digest), record seen guids, then persist etag/modified for the next
# conditional GET.
# NOTE(review): original lines 818-819, 828-829, 832-833, 835-836, 839,
# 842-843, 846, 848, 853-855, 857, and 860 are missing from this dump,
# including the docstring terminator, the `if not self.to:` guard, the
# `if self.digest:` setup and `seen = []` initialization, and the
# `self.seen[guid] = {}` line that 847 depends on.
820 ... name='test-feed',
821 ... url='http://feeds.feedburner.com/allthingsrss/hJBr')
822 >>> def send(sender, message):
823 ... print('send from {}:'.format(sender))
824 ... print(message.as_string())
825 >>> feed._send = send
826 >>> feed.to = 'jdoe@dummy.invalid'
827 >>> #parsed = feed.run() # enable for debugging
830 raise _error.NoToEmailAddress(feed=self)
831 parsed = self._fetch()
834 digest = self._new_digest()
837 for (guid, id_, sender, message) in self._process(parsed):
838 _LOG.debug('new message: {}'.format(message['Subject']))
840 seen.append((guid, id_))
841 self._append_to_digest(digest=digest, message=message)
844 self._send(sender=sender, message=message)
845 if guid not in self.seen:
847 self.seen[guid]['id'] = id_
849 if self.digest and seen:
850 if self.digest_post_process:
851 digest = self.digest_post_process(
852 feed=self, parsed=parsed, seen=seen, message=digest)
856 digest=digest, seen=seen, sender=sender, send=send)
858 self.etag = parsed.get('etag', None)
859 self.modified = parsed.get('modified', None)
861 def _new_digest(self):
# Create the multipart/digest container carrying the feed-level
# headers shared by all attached entry messages.
# NOTE(review): original lines 868-869 (presumably a blank line and
# `return digest`) are missing from this dump.
862 digest = _MIMEMultipart('digest')
863 digest['To'] = self.to # TODO: _Header(), _formataddr((recipient_name, recipient_addr))
864 digest['Subject'] = 'digest for {}'.format(self.name)
865 digest['Message-ID'] = '<{}@dev.null.invalid>'.format(_uuid.uuid4())
866 digest['User-Agent'] = _USER_AGENT
867 digest['X-RSS-Feed'] = self.url
870 def _append_to_digest(self, digest, message):
871 part = _MIMEMessage(message)
872 part.add_header('Content-Disposition', 'attachment')
875 def _send_digest(self, digest, seen, sender, send=True):
876 """Send a digest message
878 The date is extracted from the last message in the digest
879 payload. We assume that this part exists. If you don't have
880 any messages in the digest, don't call this function.
# NOTE(review): original lines 877, 881, 886, 888, and 892 are missing
# from this dump -- including the docstring terminator, the `if send:`
# guard before line 889, and the `self.seen[guid] = {}` initialization
# that line 893 depends on.
882 digest['From'] = sender # TODO: _Header(), _formataddr()...
883 last_part = digest.get_payload()[-1]
884 last_message = last_part.get_payload()[0]
885 digest['Date'] = last_message['Date']
887 _LOG.debug('new digest for {}'.format(self))
889 self._send(sender=sender, message=digest)
890 for (guid, id_) in seen:
891 if guid not in self.seen:
893 self.seen[guid]['id'] = id_