4 Parser for Apache log files. This is a port to Python of Peter Hickman's
5 Apache::LogRegex Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 Further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix
79 raised that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
95 "Loic Dachary <loic@dachary.org>"
100 class ApacheLogParserError(Exception):
103 class AttrDict(dict):
105 Allows dicts to be accessed via dot notation as well as subscripts
106 Makes using the friendly names nicer
108 def __getattr__(self, name):
113 # Explanatory comments copied from
114 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
119 # Size of response in bytes, excluding HTTP headers.
120 '%B':'response_bytes',
121 # Size of response in bytes, excluding HTTP headers. In CLF
122 # format, i.e. a "-" rather than a 0 when no bytes are sent.
123 '%b':'response_bytes_clf',
124 # The contents of cookie Foobar in the request sent to the server.
125 # Only version 0 cookies are fully supported.
128 # The time taken to serve the request, in microseconds.
129 '%D':'response_time_us',
130 # The contents of the environment variable FOOBAR
137 # The request protocol
138 '%H':'request_protocol',
139 # The contents of Foobar: header line(s) in the request sent to
140 # the server. Changes made by other modules (e.g. mod_headers)
144 # Number of keepalive requests handled on this connection.
145 # Interesting if KeepAlive is being used, so that, for example,
146 # a "1" means the first keepalive request after the initial one,
147 # "2" the second, etc...; otherwise this is always 0 (indicating
148 # the initial request). Available in versions 2.2.11 and later.
149 '%k':'keepalive_num',
150 # Remote logname (from identd, if supplied). This will return a
151 # dash unless mod_ident is present and IdentityCheck is set On.
152 '%l':'remote_logname',
154 '%m':'request_method',
155 # The contents of note Foobar from another module.
158 # The contents of Foobar: header line(s) in the reply.
160 '%{}o':'reply_header',
161 # The canonical port of the server serving the request
163 # The canonical port of the server serving the request or the
164 # server's actual port or the client's actual port. Valid
165 # formats are canonical, local, or remote.
168 # The process ID of the child that serviced the request.
170 # The process ID or thread id of the child that serviced the
171 # request. Valid formats are pid, tid, and hextid. hextid requires
172 # APR 1.2.0 or higher.
175 # The query string (prepended with a ? if a query string exists,
176 # otherwise an empty string)
178 # First line of request
179 # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
181 # The handler generating the response (if any).
182 '%R':'response_handler',
183 # Status. For requests that got internally redirected, this is
184 # the status of the *original* request --- %>s for the last.
187 # Time the request was received (standard English format)
189 # The time, in the form given by format, which should be in
190 # strftime(3) format. (potentially localized)
191 #'%{format}t':'TODO',
192 # The time taken to serve the request, in seconds.
193 '%T':'response_time_sec',
194 # Remote user (from auth; may be bogus if return status (%s) is 401)
196 # The URL path requested, not including any query string.
198 # The canonical ServerName of the server serving the request.
199 '%v':'canonical_server_name',
200 # The server name according to the UseCanonicalName setting.
201 '%V':'server_name_config', #TODO: Needs better name
202 # Connection status when response is completed:
203 # X = connection aborted before the response completed.
204 # + = connection may be kept alive after the response is sent.
205 # - = connection will be closed after the response is sent.
206 '%X':'completed_connection_status',
207 # Bytes received, including request and headers, cannot be zero.
208 # You need to enable mod_logio to use this.
209 '%I':'bytes_received',
210 # Bytes sent, including headers, cannot be zero. You need to
211 # enable mod_logio to use this
215 def __init__(self, format, use_friendly_names=False):
217 Takes the log format from an Apache configuration file.
219 Best just copy and paste directly from the .conf file
220 and pass using a Python raw string e.g.
222 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
223 p = apachelog.parser(format)
228 self._use_friendly_names = use_friendly_names
229 self._parse_format(format)
231 def _parse_format(self, format):
233 Converts the input format to a regular
234 expression, as well as extracting fields
236 Raises an exception if it couldn't compile
239 format = format.strip()
240 format = re.sub('[ \t]+',' ',format)
244 findquotes = re.compile(r'^\\"')
245 findreferreragent = re.compile('Referer|User-Agent', re.I)
246 findpercent = re.compile('^%.*t$')
247 lstripquotes = re.compile(r'^\\"')
248 rstripquotes = re.compile(r'\\"$')
251 for element in format.split(' '):
254 if findquotes.search(element): hasquotes = 1
257 element = lstripquotes.sub('', element)
258 element = rstripquotes.sub('', element)
260 if self._use_friendly_names:
261 self._names.append(self.alias(element))
263 self._names.append(element)
268 if element == '%r' or findreferreragent.search(element):
269 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
271 subpattern = r'\"([^\"]*)\"'
273 elif findpercent.search(element):
274 subpattern = r'(\[[^\]]+\])'
276 elif element == '%U':
279 subpatterns.append(subpattern)
281 self._pattern = '^' + ' '.join(subpatterns) + '$'
283 self._regex = re.compile(self._pattern)
285 raise ApacheLogParserError(e)
287 def parse(self, line):
289 Parses a single line from the log file and returns
290 a dictionary of its contents.
292 Raises an exception if it couldn't parse the line
295 match = self._regex.match(line)
299 for k, v in zip(self._names, match.groups()):
303 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
305 def alias(self, name):
307 Override / replace this method if you want to map format
308 field names to something else. This method is called
309 when the parser is constructed, not when actually parsing
312 For custom format names, such as %{Foobar}C, 'Foobar' is referred to
313 (in this function) as the custom_format and '%{}C' as the name
315 If the custom_format has a '-' in it (and is not a time format), then the
316 '-' is replaced with a '_' so the name remains a valid identifier.
318 Takes and returns a string fieldname
323 if name.startswith('%{'):
324 custom_format = '_' + name[2:-2]
325 name = '%{}' + name[-1]
328 custom_format = custom_format.replace('-', '_')
331 return self.format_to_name[name] + custom_format
337 Returns the compound regular expression the parser extracted
338 from the input format (a string)
344 Returns the field names the parser extracted from the
345 input format (a list)
364 def parse_date(date):
366 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
367 (including square brackets) and returns a two element
368 tuple containing first a timestamp of the form
369 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
370 timezone offset as is e.g.;
372 parse_date('[05/Dec/2006:10:51:44 +0000]')
373 >> ('20061205105144', '+0000')
375 It does not attempt to adjust the timestamp according
376 to the timezone - this is your problem.
387 return (''.join(elems),date[21:])
391 Frequently used log formats stored here
394 # Common Log Format (CLF)
395 'common':r'%h %l %u %t \"%r\" %>s %b',
397 # Common Log Format with Virtual Host
398 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
400 # NCSA extended/combined log format
401 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
404 if __name__ == '__main__':
407 class TestApacheLogParser(unittest.TestCase):
410 self.format = r'%h %l %u %t \"%r\" %>s '\
411 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
412 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
413 '%{User-Agent}i'.split(' ')
414 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
415 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
416 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
417 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
418 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
419 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
420 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
421 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
423 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
424 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
425 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
426 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
428 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
429 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
430 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
431 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
432 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
433 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
434 r'YPC 3.0.3; yplus 4.0.00d)\""'
435 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
436 # r'YPC 3.0.3; yplus 4.0.00d)"'
437 self.p = parser(self.format)
439 def testpattern(self):
440 self.assertEqual(self.pattern, self.p.pattern())
443 self.assertEqual(self.fields, self.p.names())
446 data = self.p.parse(self.line1)
447 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
448 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
449 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
450 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
453 'GET /images/previous.png HTTP/1.1',
456 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
457 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
460 'http://peterhi.dyndns.org/bandwidth/index.html',
461 msg = 'Line 1 %{Referer}i'
464 data['%{User-Agent}i'],
465 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
466 msg = 'Line 1 %{User-Agent}i'
471 data = self.p.parse(self.line2)
472 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
473 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
474 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
477 '[23/Jan/2004:11:36:20 +0000]',
482 r'GET /images/previous.png=\" HTTP/1.1',
485 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
486 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
489 'http://peterhi.dyndns.org/bandwidth/index.html',
490 msg = 'Line 2 %{Referer}i'
493 data['%{User-Agent}i'],
494 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
495 msg = 'Line 2 %{User-Agent}i'
499 data = self.p.parse(self.line3)
500 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
501 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
502 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
505 '[20/Jul/2004:13:18:55 -0700]',
510 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
511 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
515 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
516 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
519 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
521 msg = 'Line 3 %{Referer}i'
524 data['%{User-Agent}i'],
525 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
527 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
529 msg = 'Line 3 %{User-Agent}i'
533 def testjunkline(self):
534 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
536 def testhasquotesaltn(self):
537 p = parser(r'%a \"%b\" %c')
538 line = r'foo "xyz" bar'
540 self.assertEqual(data['%a'],'foo', '%a')
541 self.assertEqual(data['%b'],'xyz', '%c')
542 self.assertEqual(data['%c'],'bar', '%c')
544 def testparsedate(self):
545 date = '[05/Dec/2006:10:51:44 +0000]'
546 self.assertEqual(('20061205105144','+0000'),parse_date(date))
548 class TestApacheLogParserFriendlyNames(unittest.TestCase):
551 self.format = r'%h %l %u %t \"%r\" %>s '\
552 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
553 self.fields = ('remote_host remote_logname remote_user time '
554 'first_line last_status response_bytes_clf '
555 'header_Referer header_User_Agent').split(' ')
556 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
557 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
558 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
559 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
560 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
561 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
562 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
563 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
565 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
566 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
567 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
568 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
570 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
571 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
572 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
573 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
574 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
575 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
576 r'YPC 3.0.3; yplus 4.0.00d)\""'
577 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
578 # r'YPC 3.0.3; yplus 4.0.00d)"'
579 self.p = parser(self.format, True)
581 def testpattern(self):
582 self.assertEqual(self.pattern, self.p.pattern())
585 self.assertEqual(self.fields, self.p.names())
588 data = self.p.parse(self.line1)
589 self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host')
590 self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname')
591 self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user')
592 self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time')
595 'GET /images/previous.png HTTP/1.1',
596 msg = 'Line 1 first_line'
598 self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status')
599 self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf')
602 'http://peterhi.dyndns.org/bandwidth/index.html',
603 msg = 'Line 1 %{Referer}i'
606 data.header_User_Agent,
607 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
608 msg = 'Line 1 %{User-Agent}i'
613 data = self.p.parse(self.line2)
614 self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host')
615 self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname')
616 self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user')
619 '[23/Jan/2004:11:36:20 +0000]',
624 r'GET /images/previous.png=\" HTTP/1.1',
625 msg = 'Line 2 first_line'
627 self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status')
628 self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf')
631 'http://peterhi.dyndns.org/bandwidth/index.html',
632 msg = 'Line 2 %{Referer}i'
635 data.header_User_Agent,
636 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
637 msg = 'Line 2 %{User-Agent}i'
641 data = self.p.parse(self.line3)
642 self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host')
643 self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname')
644 self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user')
647 '[20/Jul/2004:13:18:55 -0700]',
652 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
653 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
655 msg = 'Line 3 first_line'
657 self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status')
658 self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf')
661 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
663 msg = 'Line 3 %{Referer}i'
666 data.header_User_Agent,
667 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
669 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
671 msg = 'Line 3 %{User-Agent}i'
675 def testjunkline(self):
676 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
678 def testhasquotesaltn(self):
679 p = parser(r'%a \"%b\" %c')
680 line = r'foo "xyz" bar'
682 self.assertEqual(data['%a'],'foo', '%a')
683 self.assertEqual(data['%b'],'xyz', '%c')
684 self.assertEqual(data['%c'],'bar', '%c')
686 def testparsedate(self):
687 date = '[05/Dec/2006:10:51:44 +0000]'
688 self.assertEqual(('20061205105144','+0000'),parse_date(date))