Only save the resolver cache when the on-disk version is out of date.
[apachelog.git] / apachelog / resolve.py
1 import os.path as _os_path
2 import pickle as _pickle
3 import re as _re
4 import socket as _socket
5
6
class Resolver (object):
    """A simple reverse-DNS resolver.

    Maintains a class-level cache of resolved IPs to avoid repeated
    lookups on the same IP address.  The cache can be persisted to
    (and is lazily loaded from) ``_cache_file`` via ``save_cache`` and
    ``load_cache``.

    Avoid hanging if we can't resolve a name.

    >>> import socket
    >>> if hasattr(socket, 'setdefaulttimeout'):
    ...     socket.setdefaulttimeout(5)  # set 5 second timeout

    >>> r = Resolver()
    >>> r.IP = {}  # clear cache of data from previous tests
    >>> r.resolve('198.41.0.4')
    'a.root-servers.net'
    >>> r.IP
    {'198.41.0.4': ('a.root-servers.net', [], ['198.41.0.4'])}

    If you want to give shorter names to various DNS names, you can
    add an entry to the class-level ``REGEXPS``.  The entry should use
    your name as the key, and a list of matching regexps as the value.
    You need to enable this enhanced resolution using the ``smart``
    argument.

    >>> r.resolve('66.249.68.33')
    'crawl-66-249-68-33.googlebot.com'
    >>> r = Resolver(smart=True)
    >>> r.resolve('66.249.68.34')
    'googlebot'

    Smart names are written into the shared cache, and are only
    computed when an address is looked up for the first time; an
    address that is already cached is returned as stored.
    """
    # Maps an IP string to a ``(name, aliases, addresses)`` triple, in
    # the layout returned by ``socket.gethostbyaddr``.
    IP = {}

    # Maps a short name to the compiled patterns that select it.
    REGEXPS = {
        'feedburner': [_re.compile('.*rate-limited-proxy-.*.google.com.*')],
        }
    for bot in [
        'baiduspider',
        'googlebot',
        'msnbot',  # a.k.a: bingbot
        'yandex',
        ]:
        REGEXPS[bot] = [_re.compile('.*{}.*'.format(bot))]
    del bot  # don't leave the loop variable behind as a class attribute

    _cache_file = _os_path.expanduser(
        _os_path.join('~', '.apachelog-resolver.cache'))
    _cache_loaded = False
    # None: no cache file has been read; False: in-memory cache matches
    # the file; True: there are entries that have not been saved yet.
    _cache_dirty = None

    def __init__(self, smart=False):
        """Create a resolver, loading the on-disk cache if necessary.

        ``smart`` enables the ``REGEXPS``-based short names for newly
        resolved addresses.
        """
        self._smart = smart
        self.load_cache()

    @classmethod
    def load_cache(cls):
        """Load ``IP`` from ``_cache_file`` the first time it is called.

        Later calls do nothing.  A missing, unreadable, empty or
        truncated cache file is ignored and the current ``IP`` is kept.
        """
        if cls._cache_loaded:
            return
        cls._cache_loaded = True
        try:
            # NOTE: unpickling can execute arbitrary code; the cache
            # file must only ever be one written by ``save_cache``.
            with open(cls._cache_file, 'rb') as f:
                cls.IP = _pickle.load(f)
            cls._cache_dirty = False
        except (IOError, EOFError, _pickle.UnpicklingError):
            # No usable cache on disk: start from what is in memory.
            pass
        if cls.IP is None:
            cls.IP = {}

    @classmethod
    def save_cache(cls):
        """Write ``IP`` to ``_cache_file`` if it has unsaved entries."""
        cls.load_cache()  # avoid clobbering unloaded content
        if cls._cache_dirty:
            with open(cls._cache_file, 'wb') as f:
                _pickle.dump(cls.IP, f)
            cls._cache_dirty = False  # the file is up to date again

    def _lookup(self, ip):
        "Return ``socket.gethostbyaddr(ip)``, or None if the lookup fails."
        try:
            return _socket.gethostbyaddr(ip)
        except (_socket.herror, _socket.gaierror):
            return None

    def resolve(self, ip):
        """Return the name for ``ip``, looking it up if it is not cached.

        When the reverse lookup fails, the IP itself is cached and
        returned as the name.  Smart renaming (if enabled) is applied
        only to successful, fresh lookups.
        """
        if ip not in self.IP:
            entry = self._lookup(ip)
            if entry is None:
                self.IP[ip] = (ip, [], [ip])
            else:
                self.IP[ip] = entry
                if self._smart:
                    self._smart_resolve(ip)
            # ``save_cache`` is a classmethod and reads the flag from
            # the class, so it has to be set there; assigning through
            # ``self`` would only create an instance attribute that
            # ``save_cache`` never sees.
            type(self)._cache_dirty = True
        return self.IP[ip][0]

    def _smart_resolve(self, ip):
        "Replace the cached name for ``ip`` by any matching ``REGEXPS`` key."
        entry = self.IP[ip]
        if self._smart:
            for name, regexps in self.REGEXPS.items():
                for regexp in regexps:
                    # Matches against the current cached name, which
                    # may already have been replaced by an earlier key.
                    if regexp.match(self.IP[ip][0]):
                        self.IP[ip] = (name, entry[1], entry[2])

    def ips(self, name):
        "Return a set of IP addresses used by a smart-resolved name."
        ips = set()
        for values in self.IP.values():
            if values[0] == name:
                ips.update(values[2])
        return ips