4 Parser for Apache log files. This is a port to python of Peter Hickman's
5 Apache::LogRegex Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 Further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix,
79 upped that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
95 "Loic Dachary <loic@dachary.org>"
100 class ApacheLogParserError(Exception):
103 class AttrDict(dict):
105 Allows dicts to be accessed via dot notation as well as subscripts
106 Makes using the friendly names nicer
108 def __getattr__(self, name):
113 # Explanatory comments copied from
114 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
119 # Size of response in bytes, excluding HTTP headers.
120 '%B':'response_bytes',
121 # Size of response in bytes, excluding HTTP headers. In CLF
122 # format, i.e. a "-" rather than a 0 when no bytes are sent.
123 '%b':'response_bytes_clf',
124 # The contents of cookie Foobar in the request sent to the server.
125 # Only version 0 cookies are fully supported.
126 #'%{Foobar}C':'TODO',
127 # The time taken to serve the request, in microseconds.
128 '%D':'response_time_us',
129 # The contents of the environment variable FOOBAR
130 #'%{FOOBAR}e':'TODO',
135 # The request protocol
136 '%H':'request_protocol',
137 # The contents of Foobar: header line(s) in the request sent to
138 # the server. Changes made by other modules (e.g. mod_headers)
140 #'%{Foobar}i':'TODO',
141 # Number of keepalive requests handled on this connection.
142 # Interesting if KeepAlive is being used, so that, for example,
143 # a "1" means the first keepalive request after the initial one,
144 # "2" the second, etc...; otherwise this is always 0 (indicating
145 # the initial request). Available in versions 2.2.11 and later.
146 '%k':'keepalive_num',
147 # Remote logname (from identd, if supplied). This will return a
148 # dash unless mod_ident is present and IdentityCheck is set On.
149 '%l':'remote_logname',
151 '%m':'request_method',
152 # The contents of note Foobar from another module.
153 #'%{Foobar}n':'TODO',
154 # The contents of Foobar: header line(s) in the reply.
155 #'%{Foobar}o':'TODO',
156 # The canonical port of the server serving the request
158 # The canonical port of the server serving the request or the
159 # server's actual port or the client's actual port. Valid
160 # formats are canonical, local, or remote.
161 #'%{format}p':"TODO",
162 # The process ID of the child that serviced the request.
164 # The process ID or thread id of the child that serviced the
165 # request. Valid formats are pid, tid, and hextid. hextid requires
166 # APR 1.2.0 or higher.
167 #'%{format}P':'TODO',
168 # The query string (prepended with a ? if a query string exists,
169 # otherwise an empty string)
171 # First line of request
173 # The handler generating the response (if any).
174 '%R':'response_handler',
175 # Status. For requests that got internally redirected, this is
176 # the status of the *original* request --- %>s for the last.
179 # Time the request was received (standard english format)
181 # The time, in the form given by format, which should be in
182 # strftime(3) format. (potentially localized)
183 #'%{format}t':'TODO',
184 # The time taken to serve the request, in seconds.
185 '%T':'response_time_sec',
186 # Remote user (from auth; may be bogus if return status (%s) is 401)
188 # The URL path requested, not including any query string.
190 # The canonical ServerName of the server serving the request.
191 '%v':'canonical_server_name',
192 # The server name according to the UseCanonicalName setting.
193 '%V':'server_name_config', #TODO: Needs better name
194 # Connection status when response is completed:
195 # X = connection aborted before the response completed.
196 # + = connection may be kept alive after the response is sent.
197 # - = connection will be closed after the response is sent.
198 '%X':'completed_connection_status',
199 # Bytes received, including request and headers, cannot be zero.
200 # You need to enable mod_logio to use this.
201 '%I':'bytes_received',
202 # Bytes sent, including headers, cannot be zero. You need to
203 # enable mod_logio to use this
207 def __init__(self, format, use_names=False):
209 Takes the log format from an Apache configuration file.
211 Best just copy and paste directly from the .conf file
212 and pass using a Python raw string e.g.
214 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
215 p = apachelog.parser(format)
220 self._use_names = use_names
221 self._parse_format(format)
223 def _parse_format(self, format):
225 Converts the input format to a regular
226 expression, as well as extracting fields
228 Raises an exception if it couldn't compile
231 format = format.strip()
232 format = re.sub('[ \t]+',' ',format)
236 findquotes = re.compile(r'^\\"')
237 findreferreragent = re.compile('Referer|User-Agent', re.I)
238 findpercent = re.compile('^%.*t$')
239 lstripquotes = re.compile(r'^\\"')
240 rstripquotes = re.compile(r'\\"$')
243 for element in format.split(' '):
246 if findquotes.search(element): hasquotes = 1
249 element = lstripquotes.sub('', element)
250 element = rstripquotes.sub('', element)
253 self._names.append(self.alias(element))
255 self._names.append(element)
260 if element == '%r' or findreferreragent.search(element):
261 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
263 subpattern = r'\"([^\"]*)\"'
265 elif findpercent.search(element):
266 subpattern = r'(\[[^\]]+\])'
268 elif element == '%U':
271 subpatterns.append(subpattern)
273 self._pattern = '^' + ' '.join(subpatterns) + '$'
275 self._regex = re.compile(self._pattern)
277 raise ApacheLogParserError(e)
279 def parse(self, line):
281 Parses a single line from the log file and returns
282 a dictionary of its contents.
284 Raises an exception if it couldn't parse the line
287 match = self._regex.match(line)
291 for k, v in zip(self._names, match.groups()):
295 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
297 def alias(self, name):
299 Override / replace this method if you want to map format
300 field names to something else. This method is called
301 when the parser is constructed, not when actually parsing
304 Takes and returns a string fieldname
307 return self.format_to_name[name]
313 Returns the compound regular expression the parser extracted
314 from the input format (a string)
320 Returns the field names the parser extracted from the
321 input format (a list)
340 def parse_date(date):
342 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
343 (including square brackets) and returns a two element
344 tuple containing first a timestamp of the form
345 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
346 timezone offset as is e.g.;
348 parse_date('[05/Dec/2006:10:51:44 +0000]')
349 >> ('20061205105144', '+0000')
351 It does not attempt to adjust the timestamp according
352 to the timezone - this is your problem.
363 return (''.join(elems),date[21:])
367 Frequently used log formats stored here
370 # Common Log Format (CLF)
371 'common':r'%h %l %u %t \"%r\" %>s %b',
373 # Common Log Format with Virtual Host
374 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
376 # NCSA extended/combined log format
377 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
380 if __name__ == '__main__':
383 class TestApacheLogParser(unittest.TestCase):
386 self.format = r'%h %l %u %t \"%r\" %>s '\
387 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
388 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
389 '%{User-Agent}i'.split(' ')
390 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
391 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
392 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
393 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
394 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
395 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
396 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
397 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
399 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
400 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
401 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
402 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
404 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
405 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
406 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
407 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
408 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
409 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
410 r'YPC 3.0.3; yplus 4.0.00d)\""'
411 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
412 # r'YPC 3.0.3; yplus 4.0.00d)"'
413 self.p = parser(self.format)
415 def testpattern(self):
416 self.assertEqual(self.pattern, self.p.pattern())
419 self.assertEqual(self.fields, self.p.names())
422 data = self.p.parse(self.line1)
423 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
424 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
425 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
426 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
429 'GET /images/previous.png HTTP/1.1',
432 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
433 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
436 'http://peterhi.dyndns.org/bandwidth/index.html',
437 msg = 'Line 1 %{Referer}i'
440 data['%{User-Agent}i'],
441 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
442 msg = 'Line 1 %{User-Agent}i'
447 data = self.p.parse(self.line2)
448 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
449 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
450 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
453 '[23/Jan/2004:11:36:20 +0000]',
458 r'GET /images/previous.png=\" HTTP/1.1',
461 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
462 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
465 'http://peterhi.dyndns.org/bandwidth/index.html',
466 msg = 'Line 2 %{Referer}i'
469 data['%{User-Agent}i'],
470 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
471 msg = 'Line 2 %{User-Agent}i'
475 data = self.p.parse(self.line3)
476 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
477 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
478 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
481 '[20/Jul/2004:13:18:55 -0700]',
486 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
487 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
491 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
492 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
495 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
497 msg = 'Line 3 %{Referer}i'
500 data['%{User-Agent}i'],
501 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
503 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
505 msg = 'Line 3 %{User-Agent}i'
509 def testjunkline(self):
510 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
512 def testhasquotesaltn(self):
513 p = parser(r'%a \"%b\" %c')
514 line = r'foo "xyz" bar'
516 self.assertEqual(data['%a'],'foo', '%a')
517 self.assertEqual(data['%b'],'xyz', '%c')
518 self.assertEqual(data['%c'],'bar', '%c')
520 def testparsedate(self):
521 date = '[05/Dec/2006:10:51:44 +0000]'
522 self.assertEqual(('20061205105144','+0000'),parse_date(date))