This brings us closer to PEP8_ compliance (CapWords class names).
Global constants are not given an explicit case convention in PEP8, but
UPPERCASE seems conventional (for example, see the `os` module).
I also moved the main docstring and metadata from
`apachelog.parser.__doc__` to `apachelog.__doc__`, so it will be
easier to find. I updated that docstring to use reStructuredText, and
converted the examples to doctests.
.. _PEP8: http://www.python.org/dev/peps/pep-0008/
+r"""Apache Log Parser
+
+Parser for Apache log files. This is a port to python of Peter Hickman's
+`Apache::LogEntry Perl module`__.
+
+.. __: http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex
+
+Takes the `Apache logging format`__ defined in your ``httpd.conf`` and
+generates a regular expression which is used to parse a line from the log
+file and return it as a dictionary with keys corresponding to the
+fields defined in the log format.
+
+.. __: http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats
+
+Import libraries used in the example:
+
+>>> import apachelog.parser, sys, StringIO, pprint
+
+You should generally be able to copy and paste the format string from
+your Apache configuration, but remember to place it in a raw string
+using single-quotes, so that backslashes are handled correctly.
+
+>>> format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
+>>> p = apachelog.parser.Parser(format)
+
+Now open your log file. For this example, we'll fake a log file with
+``StringIO``.
+
+>>> #log_stream = open('/var/apache/access.log')
+>>> log_stream = StringIO.StringIO('\n'.join([
+... '192.168.0.1 - - [18/Feb/2012:10:25:43 -0500] "GET / HTTP/1.1" 200 561 "-" "Mozilla/5.0 (...)"',
+... 'junk line',
+... ]))
+>>> for line in log_stream:
+... try:
+... data = p.parse(line)
+... except:
+... print("Unable to parse %s" % line.rstrip())
+... else:
+... pprint.pprint(data)
+{'%>s': '200',
+ '%b': '561',
+ '%h': '192.168.0.1',
+ '%l': '-',
+ '%r': 'GET / HTTP/1.1',
+ '%t': '[18/Feb/2012:10:25:43 -0500]',
+ '%u': '-',
+ '%{Referer}i': '-',
+ '%{User-Agent}i': 'Mozilla/5.0 (...)'}
+Unable to parse junk line
+
+The return dictionary from the parse method has values for each
+directive in the format string.
+
+You can also re-map the field names by subclassing (or clobbering) the
+alias method.
+
+This module provides three of the most common log formats in the
+``apachelog.parser.FORMATS`` dictionary:
+
+>>> # Common Log Format (CLF)
+>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['common'])
+>>> # Common Log Format with Virtual Host
+>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['vhcommon'])
+>>> # NCSA extended/combined log format
+>>> p = apachelog.parser.Parser(apachelog.parser.FORMATS['extended'])
+
+For some older notes regarding performance while reading lines from a
+file in Python, see `this post`__ by Fredrik Lundh. Further
+performance boost can be gained by using psyco_.
+
+.. __: http://effbot.org/zone/readline-performance.htm
+.. _psyco: http://psyco.sourceforge.net/
+
+On my system, using a loop like::
+
+ for line in open('access.log'):
+ p.parse(line)
+
+was able to parse ~60,000 lines / second. Adding psyco to the mix
+upped that to ~75,000 lines / second.
+"""
+
+__version__ = "1.2"
+__license__ = """Released under the same terms as Perl.
+See: http://dev.perl.org/licenses/
+"""
+__author__ = "Harry Fuecks <hfuecks@gmail.com>"
+__contributors__ = [
+ "Peter Hickman <peterhi@ntlworld.com>",
+ "Loic Dachary <loic@dachary.org>",
+ "W. Trevor King <wking@drexel.edu>",
+ ]
-"""Apache Log Parser
-
-Parser for Apache log files. This is a port to python of Peter Hickman's
-Apache::LogEntry Perl module:
-<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
-
-Takes the Apache logging format defined in your httpd.conf and generates
-a regular expression which is used to a line from the log file and
-return it as a dictionary with keys corresponding to the fields defined
-in the log format.
-
-Example:
-
- import apachelog, sys
-
- # Format copied and pasted from Apache conf - use raw string + single quotes
- format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
-
- p = apachelog.parser(format)
-
- for line in open('/var/apache/access.log'):
- try:
- data = p.parse(line)
- except:
- sys.stderr.write("Unable to parse %s" % line)
-
-The return dictionary from the parse method depends on the input format.
-For the above example, the returned dictionary would look like;
-
- {
- '%>s': '200',
- '%b': '2607',
- '%h': '212.74.15.68',
- '%l': '-',
- '%r': 'GET /images/previous.png HTTP/1.1',
- '%t': '[23/Jan/2004:11:36:20 +0000]',
- '%u': '-',
- '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
- '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
- }
-
-...given an access log entry like (split across lines for formatting);
-
- 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
- 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
- "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
-
-You can also re-map the field names by subclassing (or re-pointing) the
-alias method.
-
-Generally you should be able to copy and paste the format string from
-your Apache configuration, but remember to place it in a raw string
-using single-quotes, so that backslashes are handled correctly.
-
-This module provides three of the most common log formats in the
-formats dictionary;
-
- # Common Log Format (CLF)
- p = apachelog.parser(apachelog.formats['common'])
-
- # Common Log Format with Virtual Host
- p = apachelog.parser(apachelog.formats['vhcommon'])
-
- # NCSA extended/combined log format
- p = apachelog.parser(apachelog.formats['extended'])
-
-For notes regarding performance while reading lines from a file
-in Python, see <http://effbot.org/zone/readline-performance.htm>.
-Further performance boost can be gained by using psyco
-<http://psyco.sourceforge.net/>
-
-On my system, using a loop like;
-
- for line in open('access.log'):
- p.parse(line)
-
-...was able to parse ~60,000 lines / second. Adding psyco to the mix,
-up that to ~75,000 lines / second.
-"""
-
-__version__ = "1.1"
-__license__ = """Released under the same terms as Perl.
-See: http://dev.perl.org/licenses/
-"""
-__author__ = "Harry Fuecks <hfuecks@gmail.com>"
-__contributors__ = [
- "Peter Hickman <peterhi@ntlworld.com>",
- "Loic Dachary <loic@dachary.org>"
- ]
-
import re
+
class ApacheLogParserError(Exception):
pass
+
class AttrDict(dict):
"""
Allows dicts to be accessed via dot notation as well as subscripts
def __getattr__(self, name):
return self[name]
-class parser:
+"""
+Frequently used log formats stored here
+"""
+FORMATS = {
+ # Common Log Format (CLF)
+ 'common':r'%h %l %u %t \"%r\" %>s %b',
+
+ # Common Log Format with Virtual Host
+ 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
+
+ # NCSA extended/combined log format
+ 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
+ }
+
+
+class Parser (object):
format_to_name = {
# Explanatory comments copied from
# http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
input format (a list)
"""
return self._names
-
-"""
-Frequenty used log formats stored here
-"""
-formats = {
- # Common Log Format (CLF)
- 'common':r'%h %l %u %t \"%r\" %>s %b',
-
- # Common Log Format with Virtual Host
- 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
-
- # NCSA extended/combined log format
- 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
- }
import unittest
-from ..parser import ApacheLogParserError, parser, formats
+from ..parser import ApacheLogParserError, FORMATS, Parser
class TestApacheLogParser(unittest.TestCase):
r'YPC 3.0.3; yplus 4.0.00d)\""'
# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
# r'YPC 3.0.3; yplus 4.0.00d)"'
- self.p = parser(self.format)
+ self.p = Parser(self.format)
def testpattern(self):
self.assertEqual(self.pattern, self.p.pattern())
self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
def testhasquotesaltn(self):
- p = parser(r'%a \"%b\" %c')
+ p = Parser(r'%a \"%b\" %c')
line = r'foo "xyz" bar'
data = p.parse(line)
self.assertEqual(data['%a'],'foo', '%a')
r'YPC 3.0.3; yplus 4.0.00d)\""'
# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
# r'YPC 3.0.3; yplus 4.0.00d)"'
- self.p = parser(self.format, True)
+ self.p = Parser(self.format, True)
def testpattern(self):
self.assertEqual(self.pattern, self.p.pattern())
self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
def testhasquotesaltn(self):
- p = parser(r'%a \"%b\" %c')
+ p = Parser(r'%a \"%b\" %c')
line = r'foo "xyz" bar'
data = p.parse(line)
self.assertEqual(data['%a'],'foo', '%a')