4 Parser for Apache log files. This is a port to Python of Peter Hickman's
5 Apache::LogRegex Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 A further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix
79 raised that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
95 "Loic Dachary <loic@dachary.org>"
100 class ApacheLogParserError(Exception):
105 def __init__(self, format):
107 Takes the log format from an Apache configuration file.
109 Best just copy and paste directly from the .conf file
110 and pass using a Python raw string e.g.
112 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
113 p = apachelog.parser(format)
118 self._parse_format(format)
120 def _parse_format(self, format):
122 Converts the input format to a regular
123 expression, as well as extracting fields
125 Raises an exception if it couldn't compile
128 format = format.strip()
129 format = re.sub('[ \t]+',' ',format)
133 findquotes = re.compile(r'^\\"')
134 findreferreragent = re.compile('Referer|User-Agent')
135 findpercent = re.compile('^%.*t$')
136 lstripquotes = re.compile(r'^\\"')
137 rstripquotes = re.compile(r'\\"$')
140 for element in format.split(' '):
143 if findquotes.search(element): hasquotes = 1
146 element = lstripquotes.sub('', element)
147 element = rstripquotes.sub('', element)
149 self._names.append(self.alias(element))
154 if element == '%r' or findreferreragent.search(element):
155 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
157 subpattern = r'\"([^\"]*)\"'
159 elif findpercent.search(element):
160 subpattern = r'(\[[^\]]+\])'
162 elif element == '%U':
165 subpatterns.append(subpattern)
167 self._pattern = '^' + ' '.join(subpatterns) + '$'
169 self._regex = re.compile(self._pattern)
171 raise ApacheLogParserError(e)
173 def parse(self, line):
175 Parses a single line from the log file and returns
176 a dictionary of its contents.
178 Raises an exception if it couldn't parse the line
181 match = self._regex.match(line)
185 for k, v in zip(self._names, match.groups()):
189 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
191 def alias(self, name):
193 Override / replace this method if you want to map format
194 field names to something else. This method is called
195 when the parser is constructed, not when actually parsing
198 Takes and returns a string fieldname
204 Returns the compound regular expression the parser extracted
205 from the input format (a string)
211 Returns the field names the parser extracted from the
212 input format (a list)
231 def parse_date(date):
233 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
234 (including square brackets) and returns a two element
235 tuple containing first a timestamp of the form
236 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
237 timezone offset as is e.g.;
239 parse_date('[05/Dec/2006:10:51:44 +0000]')
240 >> ('20061205105144', '+0000')
242 It does not attempt to adjust the timestamp according
243 to the timezone - this is your problem.
254 return (''.join(elems),date[21:])
258 Frequently used log formats stored here
261 # Common Log Format (CLF)
262 'common':r'%h %l %u %t \"%r\" %>s %b',
264 # Common Log Format with Virtual Host
265 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
267 # NCSA extended/combined log format
268 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
271 if __name__ == '__main__':
274 class TestApacheLogParser(unittest.TestCase):
277 self.format = r'%h %l %u %t \"%r\" %>s '\
278 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
279 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
280 '%{User-Agent}i'.split(' ')
281 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
282 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
283 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
284 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
285 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
286 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
287 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
288 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
290 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
291 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
292 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
293 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
295 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
296 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
297 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
298 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
299 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
300 r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
301 r'YPC 3.0.3; yplus 4.0.00d)"'
302 self.p = parser(self.format)
304 def testpattern(self):
305 self.assertEqual(self.pattern, self.p.pattern())
308 self.assertEqual(self.fields, self.p.names())
311 data = self.p.parse(self.line1)
312 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
313 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
314 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
315 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
318 'GET /images/previous.png HTTP/1.1',
321 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
322 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
325 'http://peterhi.dyndns.org/bandwidth/index.html',
326 msg = 'Line 1 %{Referer}i'
329 data['%{User-Agent}i'],
330 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
331 msg = 'Line 1 %{User-Agent}i'
336 data = self.p.parse(self.line2)
337 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
338 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
339 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
342 '[23/Jan/2004:11:36:20 +0000]',
347 r'GET /images/previous.png=\" HTTP/1.1',
350 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
351 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
354 'http://peterhi.dyndns.org/bandwidth/index.html',
355 msg = 'Line 2 %{Referer}i'
358 data['%{User-Agent}i'],
359 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
360 msg = 'Line 2 %{User-Agent}i'
364 data = self.p.parse(self.line3)
365 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
366 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
367 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
370 '[20/Jul/2004:13:18:55 -0700]',
375 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
376 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
380 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
381 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
384 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
386 msg = 'Line 3 %{Referer}i'
389 data['%{User-Agent}i'],
390 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
392 msg = 'Line 3 %{User-Agent}i'
396 def testjunkline(self):
397 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
399 def testhasquotesaltn(self):
400 p = parser(r'%a \"%b\" %c')
401 line = r'foo "xyz" bar'
403 self.assertEqual(data['%a'],'foo', '%a')
404 self.assertEqual(data['%b'],'xyz', '%c')
405 self.assertEqual(data['%c'],'bar', '%c')
407 def testparsedate(self):
408 date = '[05/Dec/2006:10:51:44 +0000]'
409 self.assertEqual(('20061205105144','+0000'),parse_date(date))