4 Parser for Apache log files. This is a port to Python of Peter Hickman's
5 Apache::LogRegex Perl module:
6 <http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
8 Takes the Apache logging format defined in your httpd.conf and generates
9 a regular expression which is used to parse a line from the log file and
10 return it as a dictionary with keys corresponding to the fields defined
17 # Format copied and pasted from Apache conf - use raw string + single quotes
18 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
20 p = apachelog.parser(format)
22 for line in open('/var/apache/access.log'):
26 sys.stderr.write("Unable to parse %s" % line)
28 The return dictionary from the parse method depends on the input format.
29 For the above example, the returned dictionary would look like;
36 '%r': 'GET /images/previous.png HTTP/1.1',
37 '%t': '[23/Jan/2004:11:36:20 +0000]',
39 '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
40 '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
43 ...given an access log entry like (split across lines for formatting);
45 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
46 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
47 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
49 You can also re-map the field names by subclassing (or re-pointing) the
52 Generally you should be able to copy and paste the format string from
53 your Apache configuration, but remember to place it in a raw string
54 using single-quotes, so that backslashes are handled correctly.
56 This module provides three of the most common log formats in the
59 # Common Log Format (CLF)
60 p = apachelog.parser(apachelog.formats['common'])
62 # Common Log Format with Virtual Host
63 p = apachelog.parser(apachelog.formats['vhcommon'])
65 # NCSA extended/combined log format
66 p = apachelog.parser(apachelog.formats['extended'])
68 For notes regarding performance while reading lines from a file
69 in Python, see <http://effbot.org/zone/readline-performance.htm>.
70 A further performance boost can be gained by using psyco
71 <http://psyco.sourceforge.net/>
73 On my system, using a loop like;
75 for line in open('access.log'):
78 ...was able to parse ~60,000 lines / second. Adding psyco to the mix
79 upped that to ~75,000 lines / second.
81 The parse_date function is intended as a fast way to convert a log
82 date into something useful, without incurring a significant date
83 parsing overhead - good enough for basic stuff but will be a problem
84 if you need to deal with logs from multiple servers in different
89 __license__ = """Released under the same terms as Perl.
90 See: http://dev.perl.org/licenses/
92 __author__ = "Harry Fuecks <hfuecks@gmail.com>"
94 "Peter Hickman <peterhi@ntlworld.com>",
99 class ApacheLogParserError(Exception):
104 def __init__(self, format):
106 Takes the log format from an Apache configuration file.
108 Best just copy and paste directly from the .conf file
109 and pass using a Python raw string e.g.
111 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
112 p = apachelog.parser(format)
117 self._parse_format(format)
119 def _parse_format(self, format):
121 Converts the input format to a regular
122 expression, as well as extracting fields
124 Raises an exception if it couldn't compile
127 format = format.strip()
128 format = re.sub('[ \t]+',' ',format)
132 findquotes = re.compile(r'^\\"')
133 findreferreragent = re.compile('Referer|User-Agent')
134 findpercent = re.compile('^%.*t$')
135 lstripquotes = re.compile(r'^\\"')
136 rstripquotes = re.compile(r'\\"$')
139 for element in format.split(' '):
142 if findquotes.search(element): hasquotes = 1
145 element = lstripquotes.sub('', element)
146 element = rstripquotes.sub('', element)
148 self._names.append(self.alias(element))
153 if element == '%r' or findreferreragent.search(element):
154 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
156 subpattern = r'\"([^\"]*)\"'
158 elif findpercent.search(element):
159 subpattern = r'(\[[^\]]+\])'
161 elif element == '%U':
164 subpatterns.append(subpattern)
166 self._pattern = '^' + ' '.join(subpatterns) + '$'
168 self._regex = re.compile(self._pattern)
170 raise ApacheLogParserError(e)
172 def parse(self, line):
174 Parses a single line from the log file and returns
175 a dictionary of its contents.
177 Raises an exception if it couldn't parse the line
180 match = self._regex.match(line)
184 for k, v in zip(self._names, match.groups()):
188 raise ApacheLogParserError("Unable to parse: %s" % line)
190 def alias(self, name):
192 Override / replace this method if you want to map format
193 field names to something else. This method is called
194 when the parser is constructed, not when actually parsing
197 Takes and returns a string fieldname
203 Returns the compound regular expression the parser extracted
204 from the input format (a string)
210 Returns the field names the parser extracted from the
211 input format (a list)
230 def parse_date(date):
232 Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
233 (including square brackets) and returns a two element
234 tuple containing first a timestamp of the form
235 YYYYMMDDHH24IISS e.g. 20061205105144 and second the
236 timezone offset as is e.g.;
238 parse_date('[05/Dec/2006:10:51:44 +0000]')
239 >> ('20061205105144', '+0000')
241 It does not attempt to adjust the timestamp according
242 to the timezone - this is your problem.
253 return (''.join(elems),date[21:])
257 Frequently used log formats are stored here
260 # Common Log Format (CLF)
261 'common':r'%h %l %u %t \"%r\" %>s %b',
263 # Common Log Format with Virtual Host
264 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
266 # NCSA extended/combined log format
267 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
270 if __name__ == '__main__':
273 class TestApacheLogParser(unittest.TestCase):
276 self.format = r'%h %l %u %t \"%r\" %>s '\
277 r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
278 self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
279 '%{User-Agent}i'.split(' ')
280 self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
281 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
282 '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
283 '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
284 self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
285 r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
286 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
287 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
289 self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
290 r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
291 r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
292 r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
294 self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
295 r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
296 r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
297 r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
298 r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
299 r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
300 r'YPC 3.0.3; yplus 4.0.00d)"'
301 self.p = parser(self.format)
303 def testpattern(self):
304 self.assertEqual(self.pattern, self.p.pattern())
307 self.assertEqual(self.fields, self.p.names())
310 data = self.p.parse(self.line1)
311 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
312 self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
313 self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
314 self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
317 'GET /images/previous.png HTTP/1.1',
320 self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
321 self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
324 'http://peterhi.dyndns.org/bandwidth/index.html',
325 msg = 'Line 1 %{Referer}i'
328 data['%{User-Agent}i'],
329 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
330 msg = 'Line 1 %{User-Agent}i'
335 data = self.p.parse(self.line2)
336 self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
337 self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
338 self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
341 '[23/Jan/2004:11:36:20 +0000]',
346 r'GET /images/previous.png=\" HTTP/1.1',
349 self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
350 self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
353 'http://peterhi.dyndns.org/bandwidth/index.html',
354 msg = 'Line 2 %{Referer}i'
357 data['%{User-Agent}i'],
358 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
359 msg = 'Line 2 %{User-Agent}i'
363 data = self.p.parse(self.line3)
364 self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
365 self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
366 self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
369 '[20/Jul/2004:13:18:55 -0700]',
374 r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
375 r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
379 self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
380 self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
383 r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
385 msg = 'Line 3 %{Referer}i'
388 data['%{User-Agent}i'],
389 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
391 msg = 'Line 3 %{User-Agent}i'
395 def testjunkline(self):
396 self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
398 def testhasquotesaltn(self):
399 p = parser(r'%a \"%b\" %c')
400 line = r'foo "xyz" bar'
402 self.assertEqual(data['%a'],'foo', '%a')
403 self.assertEqual(data['%b'],'xyz', '%c')
404 self.assertEqual(data['%c'],'bar', '%c')
406 def testparsedate(self):
407 date = '[05/Dec/2006:10:51:44 +0000]'
408 self.assertEqual(('20061205105144','+0000'),parse_date(date))