4 Parser for Apache log files. This is a port to python of Peter Hickman's
5 Apache::LogEntry Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 A further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix
79 raises that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
95 "Loic Dachary <loic@dachary.org>"
100 class ApacheLogParserError(Exception):
103 class AttrDict(dict):
105 Allows dicts to be accessed via dot notation as well as subscripts
106 Makes using the friendly names nicer
108 def __getattr__(self, name):
113 # Explanatory comments copied from
114 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
119 # Size of response in bytes, excluding HTTP headers.
120 '%B':'response_bytes',
121 # Size of response in bytes, excluding HTTP headers. In CLF
122 # format, i.e. a "-" rather than a 0 when no bytes are sent.
123 '%b':'response_bytes_clf',
124 # The contents of cookie Foobar in the request sent to the server.
125 # Only version 0 cookies are fully supported.
126 #'%{Foobar}C':'TODO',
127 # The time taken to serve the request, in microseconds.
128 '%D':'response_time_us',
129 # The contents of the environment variable FOOBAR
130 #'%{FOOBAR}e':'TODO',
135 # The request protocol
136 '%H':'request_protocol',
137 # The contents of Foobar: header line(s) in the request sent to
138 # the server. Changes made by other modules (e.g. mod_headers)
140 #'%{Foobar}i':'TODO',
141 # Number of keepalive requests handled on this connection.
142 # Interesting if KeepAlive is being used, so that, for example,
143 # a "1" means the first keepalive request after the initial one,
144 # "2" the second, etc...; otherwise this is always 0 (indicating
145 # the initial request). Available in versions 2.2.11 and later.
146 '%k':'keepalive_num',
147 # Remote logname (from identd, if supplied). This will return a
148 # dash unless mod_ident is present and IdentityCheck is set On.
149 '%l':'remote_logname',
151 '%m':'request_method',
152 # The contents of note Foobar from another module.
153 #'%{Foobar}n':'TODO',
154 # The contents of Foobar: header line(s) in the reply.
155 #'%{Foobar}o':'TODO',
156 # The canonical port of the server serving the request
158 # The canonical port of the server serving the request or the
159 # server's actual port or the client's actual port. Valid
160 # formats are canonical, local, or remote.
161 #'%{format}p':"TODO",
162 # The process ID of the child that serviced the request.
164 # The process ID or thread id of the child that serviced the
165 # request. Valid formats are pid, tid, and hextid. hextid requires
166 # APR 1.2.0 or higher.
167 #'%{format}P':'TODO',
168 # The query string (prepended with a ? if a query string exists,
169 # otherwise an empty string)
171 # First line of request
172 # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
174 # The handler generating the response (if any).
175 '%R':'response_handler',
176 # Status. For requests that got internally redirected, this is
177 # the status of the *original* request --- %>s for the last.
180 # Time the request was received (standard English format)
182 # The time, in the form given by format, which should be in
183 # strftime(3) format. (potentially localized)
184 #'%{format}t':'TODO',
185 # The time taken to serve the request, in seconds.
186 '%T':'response_time_sec',
187 # Remote user (from auth; may be bogus if return status (%s) is 401)
189 # The URL path requested, not including any query string.
191 # The canonical ServerName of the server serving the request.
192 '%v':'canonical_server_name',
193 # The server name according to the UseCanonicalName setting.
194 '%V':'server_name_config', #TODO: Needs better name
195 # Connection status when response is completed:
196 # X = connection aborted before the response completed.
197 # + = connection may be kept alive after the response is sent.
198 # - = connection will be closed after the response is sent.
199 '%X':'completed_connection_status',
200 # Bytes received, including request and headers, cannot be zero.
201 # You need to enable mod_logio to use this.
202 '%I':'bytes_received',
203 # Bytes sent, including headers, cannot be zero. You need to
204 # enable mod_logio to use this
208 def __init__(self, format, use_friendly_names=False):
210 Takes the log format from an Apache configuration file.
212 Best just copy and paste directly from the .conf file
213 and pass using a Python raw string e.g.
215 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
216 p = apachelog.parser(format)
221 self._use_friendly_names = use_friendly_names
222 self._parse_format(format)
224 def _parse_format(self, format):
226 Converts the input format to a regular
227 expression, as well as extracting fields
229 Raises an exception if it couldn't compile
232 format = format.strip()
233 format = re.sub('[ \t]+',' ',format)
237 findquotes = re.compile(r'^\\"')
238 findreferreragent = re.compile('Referer|User-Agent', re.I)
239 findpercent = re.compile('^%.*t$')
240 lstripquotes = re.compile(r'^\\"')
241 rstripquotes = re.compile(r'\\"$')
244 for element in format.split(' '):
247 if findquotes.search(element): hasquotes = 1
250 element = lstripquotes.sub('', element)
251 element = rstripquotes.sub('', element)
253 if self._use_friendly_names:
254 self._names.append(self.alias(element))
256 self._names.append(element)
261 if element == '%r' or findreferreragent.search(element):
262 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
264 subpattern = r'\"([^\"]*)\"'
266 elif findpercent.search(element):
267 subpattern = r'(\[[^\]]+\])'
269 elif element == '%U':
272 subpatterns.append(subpattern)
274 self._pattern = '^' + ' '.join(subpatterns) + '$'
276 self._regex = re.compile(self._pattern)
278 raise ApacheLogParserError(e)
280 def parse(self, line):
282 Parses a single line from the log file and returns
283 a dictionary of its contents.
285 Raises an exception if it couldn't parse the line
288 match = self._regex.match(line)
292 for k, v in zip(self._names, match.groups()):
296 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
298 def alias(self, name):
300 Override / replace this method if you want to map format
301 field names to something else. This method is called
302 when the parser is constructed, not when actually parsing
305 Takes and returns a string fieldname
308 return self.format_to_name[name]
314 Returns the compound regular expression the parser extracted
315 from the input format (a string)
321 Returns the field names the parser extracted from the
322 input format (a list)
341 def parse_date(date):
343 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
344 (including square brackets) and returns a two element
345 tuple containing first a timestamp of the form
346 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
347 timezone offset as is e.g.;
349 parse_date('[05/Dec/2006:10:51:44 +0000]')
350 >> ('20061205105144', '+0000')
352 It does not attempt to adjust the timestamp according
353 to the timezone - this is your problem.
364 return (''.join(elems),date[21:])
368 Frequently used log formats stored here
371 # Common Log Format (CLF)
372 'common':r'%h %l %u %t \"%r\" %>s %b',
374 # Common Log Format with Virtual Host
375 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
377 # NCSA extended/combined log format
378 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
381 if __name__ == '__main__':
384 class TestApacheLogParser(unittest.TestCase):
387 self.format = r'%h %l %u %t \"%r\" %>s '\
388 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
389 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
390 '%{User-Agent}i'.split(' ')
391 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
392 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
393 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
394 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
395 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
396 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
397 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
398 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
400 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
401 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
402 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
403 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
405 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
406 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
407 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
408 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
409 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
410 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
411 r'YPC 3.0.3; yplus 4.0.00d)\""'
412 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
413 # r'YPC 3.0.3; yplus 4.0.00d)"'
414 self.p = parser(self.format)
416 def testpattern(self):
417 self.assertEqual(self.pattern, self.p.pattern())
420 self.assertEqual(self.fields, self.p.names())
423 data = self.p.parse(self.line1)
424 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
425 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
426 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
427 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
430 'GET /images/previous.png HTTP/1.1',
433 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
434 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
437 'http://peterhi.dyndns.org/bandwidth/index.html',
438 msg = 'Line 1 %{Referer}i'
441 data['%{User-Agent}i'],
442 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
443 msg = 'Line 1 %{User-Agent}i'
448 data = self.p.parse(self.line2)
449 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
450 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
451 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
454 '[23/Jan/2004:11:36:20 +0000]',
459 r'GET /images/previous.png=\" HTTP/1.1',
462 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
463 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
466 'http://peterhi.dyndns.org/bandwidth/index.html',
467 msg = 'Line 2 %{Referer}i'
470 data['%{User-Agent}i'],
471 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
472 msg = 'Line 2 %{User-Agent}i'
476 data = self.p.parse(self.line3)
477 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
478 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
479 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
482 '[20/Jul/2004:13:18:55 -0700]',
487 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
488 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
492 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
493 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
496 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
498 msg = 'Line 3 %{Referer}i'
501 data['%{User-Agent}i'],
502 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
504 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
506 msg = 'Line 3 %{User-Agent}i'
510 def testjunkline(self):
511 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
513 def testhasquotesaltn(self):
514 p = parser(r'%a \"%b\" %c')
515 line = r'foo "xyz" bar'
517 self.assertEqual(data['%a'],'foo', '%a')
518 self.assertEqual(data['%b'],'xyz', '%c')
519 self.assertEqual(data['%c'],'bar', '%c')
521 def testparsedate(self):
522 date = '[05/Dec/2006:10:51:44 +0000]'
523 self.assertEqual(('20061205105144','+0000'),parse_date(date))
525 class TestApacheLogParserFriendlyNames(unittest.TestCase):
528 self.format = r'%h %l %u %t \"%r\" %>s '\
529 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
530 self.fields = ('remote_host remote_logname remote_user time '
531 'first_line last_status response_bytes_clf '
532 '%{Referer}i %{User-Agent}i').split(' ')
533 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
534 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
535 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
536 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
537 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
538 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
539 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
540 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
542 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
543 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
544 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
545 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
547 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
548 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
549 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
550 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
551 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
552 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
553 r'YPC 3.0.3; yplus 4.0.00d)\""'
554 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
555 # r'YPC 3.0.3; yplus 4.0.00d)"'
556 self.p = parser(self.format, True)
558 def testpattern(self):
559 self.assertEqual(self.pattern, self.p.pattern())
562 self.assertEqual(self.fields, self.p.names())
565 data = self.p.parse(self.line1)
566 self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host')
567 self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname')
568 self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user')
569 self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time')
572 'GET /images/previous.png HTTP/1.1',
573 msg = 'Line 1 first_line'
575 self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status')
576 self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf')
579 'http://peterhi.dyndns.org/bandwidth/index.html',
580 msg = 'Line 1 %{Referer}i'
583 data['%{User-Agent}i'],
584 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
585 msg = 'Line 1 %{User-Agent}i'
590 data = self.p.parse(self.line2)
591 self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host')
592 self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname')
593 self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user')
596 '[23/Jan/2004:11:36:20 +0000]',
601 r'GET /images/previous.png=\" HTTP/1.1',
602 msg = 'Line 2 first_line'
604 self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status')
605 self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf')
608 'http://peterhi.dyndns.org/bandwidth/index.html',
609 msg = 'Line 2 %{Referer}i'
612 data['%{User-Agent}i'],
613 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
614 msg = 'Line 2 %{User-Agent}i'
618 data = self.p.parse(self.line3)
619 self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host')
620 self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname')
621 self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user')
624 '[20/Jul/2004:13:18:55 -0700]',
629 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
630 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
632 msg = 'Line 3 first_line'
634 self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status')
635 self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf')
638 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
640 msg = 'Line 3 %{Referer}i'
643 data['%{User-Agent}i'],
644 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
646 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
648 msg = 'Line 3 %{User-Agent}i'
652 def testjunkline(self):
653 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
655 def testhasquotesaltn(self):
656 p = parser(r'%a \"%b\" %c')
657 line = r'foo "xyz" bar'
659 self.assertEqual(data['%a'],'foo', '%a')
660 self.assertEqual(data['%b'],'xyz', '%c')
661 self.assertEqual(data['%c'],'bar', '%c')
663 def testparsedate(self):
664 date = '[05/Dec/2006:10:51:44 +0000]'
665 self.assertEqual(('20061205105144','+0000'),parse_date(date))