Change my email address from drexel.edu to tremily.us.
[apachelog.git] / apachelog / resolve.py
1 import os.path as _os_path
2 import pickle as _pickle
3 import re as _re
4 import socket as _socket
5
6
class Resolver (object):
    """A simple reverse-DNS resolver.

    Maintains a class-level cache of resolved IPs to avoid repeated
    lookups on the same IP address.  The cache can be persisted
    between runs with ``save_cache`` and is read back (once per
    process) the first time a ``Resolver`` is instantiated.

    Avoid hanging if we can't resolve a name.

    >>> import socket
    >>> if hasattr(socket, 'setdefaulttimeout'):
    ...     socket.setdefaulttimeout(5)  # set 5 second timeout

    >>> r = Resolver()
    >>> r.IP = {}  # clear cache of data from previous tests
    >>> r.resolve('198.41.0.4')
    'a.root-servers.net'
    >>> r.IP
    {'198.41.0.4': ('a.root-servers.net', [], ['198.41.0.4'])}

    If you want to give shorter names to various DNS names, you can
    add an entry to the class-level ``REGEXPS``.  The entry should use
    your name as the key, and a list of matching regexps as the value.
    You need to enable this enhanced resolution using the ``smart``
    argument.

    >>> r.resolve('66.249.68.33')
    'crawl-66-249-68-33.googlebot.com'
    >>> r = Resolver(smart=True)
    >>> r.resolve('66.249.68.34')
    'googlebot'
    """
    # Maps an IP string to a ``(hostname, aliaslist, ipaddrlist)`` tuple,
    # i.e. the shape returned by ``socket.gethostbyaddr``.
    IP = {}

    # Short name -> list of compiled patterns matched against hostnames.
    # NOTE(review): the '.' characters in these patterns are unescaped,
    # so they match any character rather than a literal dot.
    REGEXPS = {
        'feedburner': [_re.compile('.*rate-limited-proxy-.*.google.com.*')],
        'yahoo': [_re.compile('.*crawl.yahoo.*')],  # slurp
        }
    for bot in [
        'baiduspider',
        'googlebot',
        'msnbot',  # a.k.a: bingbot
        'yandex',
        ]:
        REGEXPS[bot] = [_re.compile('.*{}.*'.format(bot))]

    _cache_file = _os_path.expanduser(
        _os_path.join('~', '.apachelog-resolver.cache'))
    # True once ``load_cache`` has been attempted (successfully or not).
    _cache_loaded = False
    # None until the cache file is read (False) or a lookup adds an
    # entry (True); ``save_cache`` only writes when this is truthy.
    _cache_dirty = None

    def __init__(self, smart=False):
        """Create a resolver and make sure the on-disk cache is loaded.

        ``smart`` enables rewriting of freshly resolved hostnames to
        the short names defined in ``REGEXPS``.
        """
        self._smart = smart
        self.load_cache()

    @classmethod
    def load_cache(cls):
        """Load the pickled IP cache from ``_cache_file``, at most once.

        A missing, unreadable, empty or corrupt cache file is treated
        as "no cache" rather than an error, so a damaged file cannot
        prevent ``Resolver`` objects from being created.
        """
        if cls._cache_loaded:
            return
        cls._cache_loaded = True
        try:
            # SECURITY NOTE: unpickling executes code embedded in the
            # file.  This is only acceptable because the cache lives in
            # the user's own home directory; never point ``_cache_file``
            # at data from an untrusted source.
            with open(cls._cache_file, 'rb') as f:
                cached = _pickle.load(f)
        except (IOError, EOFError, _pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            # No usable cache on disk; keep whatever is already in memory.
            pass
        else:
            if isinstance(cached, dict):
                cls.IP = cached
                cls._cache_dirty = False
            # Anything else (e.g. a pickled None) is not a valid cache
            # and is ignored.
        if cls.IP is None:
            cls.IP = {}

    @classmethod
    def save_cache(cls):
        """Write the IP cache to ``_cache_file`` if it has new entries."""
        cls.load_cache()  # avoid clobbering unloaded content
        if cls._cache_dirty:
            with open(cls._cache_file, 'wb') as f:
                _pickle.dump(cls.IP, f)
            # The file now matches memory; skip redundant rewrites.
            cls._cache_dirty = False

    def resolve(self, ip):
        """Return the (possibly shortened) hostname for ``ip``.

        Cached answers are returned as-is.  On a cache miss the address
        is looked up with ``socket.gethostbyaddr``; if that fails, the
        IP itself is cached as its own name.  Smart shortening is only
        applied to names obtained by a fresh, successful lookup.
        """
        if ip not in self.IP:
            Resolver._cache_dirty = True
            try:
                self.IP[ip] = _socket.gethostbyaddr(ip)
            except (_socket.herror, _socket.gaierror):
                # Lookup failed: fall back to the bare address.
                self.IP[ip] = (ip, [], [ip])
            else:
                if self._smart:
                    self._smart_resolve(ip)
        return self.IP[ip][0]

    def _smart_resolve(self, ip):
        """Replace the cached name for ``ip`` with a ``REGEXPS`` short name.

        Does nothing unless the resolver was created with ``smart=True``.
        Every pattern is tested against the name *currently* stored for
        ``ip``, so once a short name has been substituted, the remaining
        patterns are matched against that short name.
        """
        if not self._smart:
            return
        _, aliases, addresses = self.IP[ip]
        for name, regexps in self.REGEXPS.items():
            for regexp in regexps:
                if regexp.match(self.IP[ip][0]):
                    self.IP[ip] = (name, aliases, addresses)

    def ips(self, name):
        "Return a set of IP addresses used by a smart-resolved name."
        return {
            address
            for values in self.IP.values()
            if values[0] == name
            for address in values[2]
            }