From: W. Trevor King Date: Sat, 18 Feb 2012 18:02:10 +0000 (-0500) Subject: Rename parser -> Parser and formats -> FORMATS in apachelog.parser. X-Git-Url: http://git.tremily.us/?p=apachelog.git;a=commitdiff_plain;h=535e4a372355c66c70b707794dbd15aa0c846488 Rename parser -> Parser and formats -> FORMATS in apachelog.parser. This brings us closer to PEP8_ compliance (CapWords class names). Global constants are not given an explicit case convention in PEP8, but UPPERCASE seems conventional (for example, see the `os` module). I also moved the main docstring and metadata from `apachelog.parser.__doc__` to `apachelog.__doc__`, so it will be easier to find. I updated that docstring to use reStructuredText, and converted the examples to doctests. .. _PEP8: http://www.python.org/dev/peps/pep-0008/ --- diff --git a/apachelog/__init__.py b/apachelog/__init__.py index e69de29..c4c1ef7 100644 --- a/apachelog/__init__.py +++ b/apachelog/__init__.py @@ -0,0 +1,93 @@ +r"""Apache Log Parser + +Parser for Apache log files. This is a port to python of Peter Hickman's +`Apache::LogEntry Perl module`__. + +.. __: http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex + +Takes the `Apache logging format`__ defined in your ``httpd.conf`` and +generates a regular expression which is used to parse a line from the log +file and return it as a dictionary with keys corresponding to the +fields defined in the log format. + +.. __: http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats + +Import libraries used in the example: + +>>> import apachelog.parser, sys, StringIO, pprint + +You should generally be able to copy and paste the format string from +your Apache configuration, but remember to place it in a raw string +using single-quotes, so that backslashes are handled correctly. + +>>> format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' +>>> p = apachelog.parser.Parser(format) + +Now open your log file. For this example, we'll fake a log file with +``StringIO``.
+ +>>> #log_stream = open('/var/apache/access.log') +>>> log_stream = StringIO.StringIO('\n'.join([ +... '192.168.0.1 - - [18/Feb/2012:10:25:43 -0500] "GET / HTTP/1.1" 200 561 "-" "Mozilla/5.0 (...)"', +... 'junk line', +... ])) +>>> for line in log_stream: +... try: +... data = p.parse(line) +... except: +... print("Unable to parse %s" % line.rstrip()) +... else: +... pprint.pprint(data) +{'%>s': '200', + '%b': '561', + '%h': '192.168.0.1', + '%l': '-', + '%r': 'GET / HTTP/1.1', + '%t': '[18/Feb/2012:10:25:43 -0500]', + '%u': '-', + '%{Referer}i': '-', + '%{User-Agent}i': 'Mozilla/5.0 (...)'} +Unable to parse junk line + +The return dictionary from the parse method has values for each +directive in the format string. + +You can also re-map the field names by subclassing (or clobbering) the +alias method. + +This module provides three of the most common log formats in the +``FORMATS`` dictionary: + +>>> # Common Log Format (CLF) +>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['common']) +>>> # Common Log Format with Virtual Host +>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['vhcommon']) +>>> # NCSA extended/combined log format +>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['extended']) + +For some older notes regarding performance while reading lines from a +file in Python, see `this post`__ by Fredrik Lundh. A further +performance boost can be gained by using psyco_. + +.. __: http://effbot.org/zone/readline-performance.htm +.. _psyco: http://psyco.sourceforge.net/ + +On my system, using a loop like:: + + for line in open('access.log'): + p.parse(line) + +was able to parse ~60,000 lines / second. Adding psyco to the mix +upped that to ~75,000 lines / second. +""" + +__version__ = "1.2" +__license__ = """Released under the same terms as Perl. +See: http://dev.perl.org/licenses/ +""" +__author__ = "Harry Fuecks " +__contributors__ = [ + "Peter Hickman ", + "Loic Dachary ", + "W.
Trevor King ", + ] diff --git a/apachelog/parser.py b/apachelog/parser.py index 6c7e3ae..288f147 100644 --- a/apachelog/parser.py +++ b/apachelog/parser.py @@ -1,98 +1,10 @@ -"""Apache Log Parser - -Parser for Apache log files. This is a port to python of Peter Hickman's -Apache::LogEntry Perl module: - - -Takes the Apache logging format defined in your httpd.conf and generates -a regular expression which is used to a line from the log file and -return it as a dictionary with keys corresponding to the fields defined -in the log format. - -Example: - - import apachelog, sys - - # Format copied and pasted from Apache conf - use raw string + single quotes - format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' - - p = apachelog.parser(format) - - for line in open('/var/apache/access.log'): - try: - data = p.parse(line) - except: - sys.stderr.write("Unable to parse %s" % line) - -The return dictionary from the parse method depends on the input format. -For the above example, the returned dictionary would look like; - - { - '%>s': '200', - '%b': '2607', - '%h': '212.74.15.68', - '%l': '-', - '%r': 'GET /images/previous.png HTTP/1.1', - '%t': '[23/Jan/2004:11:36:20 +0000]', - '%u': '-', - '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html', - '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202' - } - -...given an access log entry like (split across lines for formatting); - - 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1" - 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html" - "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202" - -You can also re-map the field names by subclassing (or re-pointing) the -alias method. - -Generally you should be able to copy and paste the format string from -your Apache configuration, but remember to place it in a raw string -using single-quotes, so that backslashes are handled correctly. 
- -This module provides three of the most common log formats in the -formats dictionary; - - # Common Log Format (CLF) - p = apachelog.parser(apachelog.formats['common']) - - # Common Log Format with Virtual Host - p = apachelog.parser(apachelog.formats['vhcommon']) - - # NCSA extended/combined log format - p = apachelog.parser(apachelog.formats['extended']) - -For notes regarding performance while reading lines from a file -in Python, see . -Further performance boost can be gained by using psyco - - -On my system, using a loop like; - - for line in open('access.log'): - p.parse(line) - -...was able to parse ~60,000 lines / second. Adding psyco to the mix, -up that to ~75,000 lines / second. -""" - -__version__ = "1.1" -__license__ = """Released under the same terms as Perl. -See: http://dev.perl.org/licenses/ -""" -__author__ = "Harry Fuecks " -__contributors__ = [ - "Peter Hickman ", - "Loic Dachary " - ] - import re + class ApacheLogParserError(Exception): pass + class AttrDict(dict): """ Allows dicts to be accessed via dot notation as well as subscripts @@ -101,7 +13,22 @@ class AttrDict(dict): def __getattr__(self, name): return self[name] -class parser: +""" +Frequently used log formats stored here +""" +FORMATS = { + # Common Log Format (CLF) + 'common':r'%h %l %u %t \"%r\" %>s %b', + + # Common Log Format with Virtual Host + 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b', + + # NCSA extended/combined log format + 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"', + } + + +class Parser (object): format_to_name = { # Explanatory comments copied from # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html @@ -338,17 +265,3 @@ class parser: input format (a list) """ return self._names - -""" -Frequenty used log formats stored here -""" -formats = { - # Common Log Format (CLF) - 'common':r'%h %l %u %t \"%r\" %>s %b', - - # Common Log Format with Virtual Host - 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b', - - # NCSA extended/combined log
format - 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"', - } diff --git a/apachelog/test/test_parser.py b/apachelog/test/test_parser.py index 2faa76a..8dc1f47 100644 --- a/apachelog/test/test_parser.py +++ b/apachelog/test/test_parser.py @@ -1,6 +1,6 @@ import unittest -from ..parser import ApacheLogParserError, parser, formats +from ..parser import ApacheLogParserError, FORMATS, Parser class TestApacheLogParser(unittest.TestCase): @@ -33,7 +33,7 @@ class TestApacheLogParser(unittest.TestCase): r'YPC 3.0.3; yplus 4.0.00d)\""' # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ # r'YPC 3.0.3; yplus 4.0.00d)"' - self.p = parser(self.format) + self.p = Parser(self.format) def testpattern(self): self.assertEqual(self.pattern, self.p.pattern()) @@ -133,7 +133,7 @@ class TestApacheLogParser(unittest.TestCase): self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') def testhasquotesaltn(self): - p = parser(r'%a \"%b\" %c') + p = Parser(r'%a \"%b\" %c') line = r'foo "xyz" bar' data = p.parse(line) self.assertEqual(data['%a'],'foo', '%a') @@ -171,7 +171,7 @@ class TestApacheLogParserFriendlyNames(unittest.TestCase): r'YPC 3.0.3; yplus 4.0.00d)\""' # r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ # r'YPC 3.0.3; yplus 4.0.00d)"' - self.p = parser(self.format, True) + self.p = Parser(self.format, True) def testpattern(self): self.assertEqual(self.pattern, self.p.pattern()) @@ -271,7 +271,7 @@ class TestApacheLogParserFriendlyNames(unittest.TestCase): self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') def testhasquotesaltn(self): - p = parser(r'%a \"%b\" %c') + p = Parser(r'%a \"%b\" %c') line = r'foo "xyz" bar' data = p.parse(line) self.assertEqual(data['%a'],'foo', '%a')