4 Parser for Apache log files. This is a port to Python of Peter Hickman's
5 Apache::LogRegex Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 A further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix
79 raised that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
95 "Loic Dachary <loic@dachary.org>"
100 class ApacheLogParserError(Exception):
105 # Explanatory comments copied from
106 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
111 # Size of response in bytes, excluding HTTP headers.
112 '%B':'response_bytes',
113 # Size of response in bytes, excluding HTTP headers. In CLF
114 # format, i.e. a "-" rather than a 0 when no bytes are sent.
115 '%b':'response_bytes_clf',
116 # The contents of cookie Foobar in the request sent to the server.
117 # Only version 0 cookies are fully supported.
118 #'%{Foobar}C':'TODO',
119 # The time taken to serve the request, in microseconds.
120 '%D':'response_time_us',
121 # The contents of the environment variable FOOBAR
122 #'%{FOOBAR}e':'TODO',
127 # The request protocol
128 '%H':'request_protocol',
129 # The contents of Foobar: header line(s) in the request sent to
130 # the server. Changes made by other modules (e.g. mod_headers)
132 #'%{Foobar}i':'TODO',
133 # Number of keepalive requests handled on this connection.
134 # Interesting if KeepAlive is being used, so that, for example,
135 # a "1" means the first keepalive request after the initial one,
136 # "2" the second, etc...; otherwise this is always 0 (indicating
137 # the initial request). Available in versions 2.2.11 and later.
138 '%k':'keepalive_num',
139 # Remote logname (from identd, if supplied). This will return a
140 # dash unless mod_ident is present and IdentityCheck is set On.
141 '%l':'remote_logname',
143 '%m':'request_method',
144 # The contents of note Foobar from another module.
145 #'%{Foobar}n':'TODO',
146 # The contents of Foobar: header line(s) in the reply.
147 #'%{Foobar}o':'TODO',
148 # The canonical port of the server serving the request
150 # The canonical port of the server serving the request or the
151 # server's actual port or the client's actual port. Valid
152 # formats are canonical, local, or remote.
153 #'%{format}p':"TODO",
154 # The process ID of the child that serviced the request.
156 # The process ID or thread id of the child that serviced the
157 # request. Valid formats are pid, tid, and hextid. hextid requires
158 # APR 1.2.0 or higher.
159 #'%{format}P':'TODO',
160 # The query string (prepended with a ? if a query string exists,
161 # otherwise an empty string)
163 # First line of request
165 # The handler generating the response (if any).
166 '%R':'response_handler',
167 # Status. For requests that got internally redirected, this is
168 # the status of the *original* request --- %>s for the last.
170 # Time the request was received (standard English format)
172 # The time, in the form given by format, which should be in
173 # strftime(3) format. (potentially localized)
174 #'%{format}t':'TODO',
175 # The time taken to serve the request, in seconds.
176 '%T':'response_time_sec',
177 # Remote user (from auth; may be bogus if return status (%s) is 401)
179 # The URL path requested, not including any query string.
181 # The canonical ServerName of the server serving the request.
182 '%v':'canonical_server_name',
183 # The server name according to the UseCanonicalName setting.
184 '%V':'server_name_config', #TODO: Needs better name
185 # Connection status when response is completed:
186 # X = connection aborted before the response completed.
187 # + = connection may be kept alive after the response is sent.
188 # - = connection will be closed after the response is sent.
189 '%X':'completed_connection_status',
190 # Bytes received, including request and headers, cannot be zero.
191 # You need to enable mod_logio to use this.
192 '%I':'bytes_received',
193 # Bytes sent, including headers, cannot be zero. You need to
194 # enable mod_logio to use this
198 def __init__(self, format):
200 Takes the log format from an Apache configuration file.
202 Best just copy and paste directly from the .conf file
203 and pass using a Python raw string e.g.
205 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
206 p = apachelog.parser(format)
211 self._parse_format(format)
213 def _parse_format(self, format):
215 Converts the input format to a regular
216 expression, as well as extracting fields
218 Raises an exception if it couldn't compile
221 format = format.strip()
222 format = re.sub('[ \t]+',' ',format)
226 findquotes = re.compile(r'^\\"')
227 findreferreragent = re.compile('Referer|User-Agent', re.I)
228 findpercent = re.compile('^%.*t$')
229 lstripquotes = re.compile(r'^\\"')
230 rstripquotes = re.compile(r'\\"$')
233 for element in format.split(' '):
236 if findquotes.search(element): hasquotes = 1
239 element = lstripquotes.sub('', element)
240 element = rstripquotes.sub('', element)
242 self._names.append(self.alias(element))
247 if element == '%r' or findreferreragent.search(element):
248 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
250 subpattern = r'\"([^\"]*)\"'
252 elif findpercent.search(element):
253 subpattern = r'(\[[^\]]+\])'
255 elif element == '%U':
258 subpatterns.append(subpattern)
260 self._pattern = '^' + ' '.join(subpatterns) + '$'
262 self._regex = re.compile(self._pattern)
264 raise ApacheLogParserError(e)
266 def parse(self, line):
268 Parses a single line from the log file and returns
269 a dictionary of its contents.
271 Raises an exception if it couldn't parse the line
274 match = self._regex.match(line)
278 for k, v in zip(self._names, match.groups()):
282 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
284 def alias(self, name):
286 Override / replace this method if you want to map format
287 field names to something else. This method is called
288 when the parser is constructed, not when actually parsing
291 Takes and returns a string fieldname
294 return self.format_to_name[name]
300 Returns the compound regular expression the parser extracted
301 from the input format (a string)
307 Returns the field names the parser extracted from the
308 input format (a list)
327 def parse_date(date):
329 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
330 (including square brackets) and returns a two element
331 tuple containing first a timestamp of the form
332 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
333 timezone offset as is e.g.;
335 parse_date('[05/Dec/2006:10:51:44 +0000]')
336 >> ('20061205105144', '+0000')
338 It does not attempt to adjust the timestamp according
339 to the timezone - this is your problem.
350 return (''.join(elems),date[21:])
354 Frequently used log formats stored here
357 # Common Log Format (CLF)
358 'common':r'%h %l %u %t \"%r\" %>s %b',
360 # Common Log Format with Virtual Host
361 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
363 # NCSA extended/combined log format
364 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
367 if __name__ == '__main__':
370 class TestApacheLogParser(unittest.TestCase):
373 self.format = r'%h %l %u %t \"%r\" %>s '\
374 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
375 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
376 '%{User-Agent}i'.split(' ')
377 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
378 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
379 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
380 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
381 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
382 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
383 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
384 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
386 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
387 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
388 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
389 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
391 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
392 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
393 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
394 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
395 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
396 r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
397 r'YPC 3.0.3; yplus 4.0.00d)\""'
398 # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
399 # r'YPC 3.0.3; yplus 4.0.00d)"'
400 self.p = parser(self.format)
402 def testpattern(self):
403 self.assertEqual(self.pattern, self.p.pattern())
406 self.assertEqual(self.fields, self.p.names())
409 data = self.p.parse(self.line1)
410 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
411 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
412 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
413 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
416 'GET /images/previous.png HTTP/1.1',
419 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
420 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
423 'http://peterhi.dyndns.org/bandwidth/index.html',
424 msg = 'Line 1 %{Referer}i'
427 data['%{User-Agent}i'],
428 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
429 msg = 'Line 1 %{User-Agent}i'
434 data = self.p.parse(self.line2)
435 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
436 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
437 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
440 '[23/Jan/2004:11:36:20 +0000]',
445 r'GET /images/previous.png=\" HTTP/1.1',
448 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
449 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
452 'http://peterhi.dyndns.org/bandwidth/index.html',
453 msg = 'Line 2 %{Referer}i'
456 data['%{User-Agent}i'],
457 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
458 msg = 'Line 2 %{User-Agent}i'
462 data = self.p.parse(self.line3)
463 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
464 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
465 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
468 '[20/Jul/2004:13:18:55 -0700]',
473 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
474 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
478 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
479 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
482 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
484 msg = 'Line 3 %{Referer}i'
487 data['%{User-Agent}i'],
488 '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
490 # 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
492 msg = 'Line 3 %{User-Agent}i'
496 def testjunkline(self):
497 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
499 def testhasquotesaltn(self):
500 p = parser(r'%a \"%b\" %c')
501 line = r'foo "xyz" bar'
503 self.assertEqual(data['%a'],'foo', '%a')
504 self.assertEqual(data['%b'],'xyz', '%c')
505 self.assertEqual(data['%c'],'bar', '%c')
507 def testparsedate(self):
508 date = '[05/Dec/2006:10:51:44 +0000]'
509 self.assertEqual(('20061205105144','+0000'),parse_date(date))