Restructure from single module into a package.
author W. Trevor King <wking@drexel.edu>
Sat, 18 Feb 2012 17:00:09 +0000 (12:00 -0500)
committer W. Trevor King <wking@drexel.edu>
Sat, 18 Feb 2012 17:24:40 +0000 (12:24 -0500)
I also split the tests out into their own submodule.  You can run them
with:

  nosetests apachelog

Once we get some doctests involved, that will change to

  nosetests --with-doctest apachelog

.gitignore [new file with mode: 0644]
apachelog.py [deleted file]
apachelog/__init__.py [new file with mode: 0644]
apachelog/parser.py [new file with mode: 0644]
apachelog/test/__init__.py [new file with mode: 0644]
apachelog/test/test_parser.py [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..0d20b64
--- /dev/null
@@ -0,0 +1 @@
+*.pyc
diff --git a/apachelog.py b/apachelog.py
deleted file mode 100755 (executable)
index 5609ca5..0000000
+++ /dev/null
@@ -1,691 +0,0 @@
-#!/usr/bin/env python
-"""Apache Log Parser
-
-Parser for Apache log files. This is a port to python of Peter Hickman's
-Apache::LogEntry Perl module:
-<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
-
-Takes the Apache logging format defined in your httpd.conf and generates
-a regular expression which is used to a line from the log file and
-return it as a dictionary with keys corresponding to the fields defined
-in the log format.
-
-Example:
-
-    import apachelog, sys
-
-    # Format copied and pasted from Apache conf - use raw string + single quotes
-    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
-
-    p = apachelog.parser(format)
-
-    for line in open('/var/apache/access.log'):
-        try:
-           data = p.parse(line)
-        except:
-           sys.stderr.write("Unable to parse %s" % line)
-
-The return dictionary from the parse method depends on the input format.
-For the above example, the returned dictionary would look like;
-
-    {
-    '%>s': '200',
-    '%b': '2607',
-    '%h': '212.74.15.68',
-    '%l': '-',
-    '%r': 'GET /images/previous.png HTTP/1.1',
-    '%t': '[23/Jan/2004:11:36:20 +0000]',
-    '%u': '-',
-    '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
-    '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
-    }
-
-...given an access log entry like (split across lines for formatting);
-
-    212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
-        200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
-        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
-
-You can also re-map the field names by subclassing (or re-pointing) the
-alias method.
-
-Generally you should be able to copy and paste the format string from
-your Apache configuration, but remember to place it in a raw string
-using single-quotes, so that backslashes are handled correctly.
-
-This module provides three of the most common log formats in the
-formats dictionary;
-
-    # Common Log Format (CLF)
-    p = apachelog.parser(apachelog.formats['common'])
-
-    # Common Log Format with Virtual Host
-    p = apachelog.parser(apachelog.formats['vhcommon'])
-
-    # NCSA extended/combined log format
-    p = apachelog.parser(apachelog.formats['extended'])
-
-For notes regarding performance while reading lines from a file
-in Python, see <http://effbot.org/zone/readline-performance.htm>.
-Further performance boost can be gained by using psyco
-<http://psyco.sourceforge.net/>
-
-On my system, using a loop like;
-
-    for line in open('access.log'):
-        p.parse(line)
-
-...was able to parse ~60,000 lines / second. Adding psyco to the mix,
-up that to ~75,000 lines / second.
-
-The parse_date function is intended as a fast way to convert a log
-date into something useful, without incurring a significant date
-parsing overhead - good enough for basic stuff but will be a problem
-if you need to deal with log from multiple servers in different
-timezones.
-"""
-
-__version__ = "1.1"
-__license__ = """Released under the same terms as Perl.
-See: http://dev.perl.org/licenses/
-"""
-__author__ = "Harry Fuecks <hfuecks@gmail.com>"
-__contributors__ = [
-    "Peter Hickman <peterhi@ntlworld.com>",
-    "Loic Dachary <loic@dachary.org>"
-    ]
-
-import re
-
-class ApacheLogParserError(Exception):
-    pass
-
-class AttrDict(dict):
-    """
-    Allows dicts to be accessed via dot notation as well as subscripts
-    Makes using the friendly names nicer
-    """
-    def __getattr__(self, name):
-        return self[name]
-
-class parser:
-    format_to_name = {
-        # Explanatory comments copied from
-        # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
-        # Remote IP-address
-        '%a':'remote_ip',
-        # Local IP-address
-        '%A':'local_ip',
-        # Size of response in bytes, excluding HTTP headers.
-        '%B':'response_bytes',
-        # Size of response in bytes, excluding HTTP headers. In CLF
-        # format, i.e. a "-" rather than a 0 when no bytes are sent.
-        '%b':'response_bytes_clf',
-        # The contents of cookie Foobar in the request sent to the server.
-        # Only version 0 cookies are fully supported.
-        #'%{Foobar}C':'',
-        '%{}C':'cookie',
-        # The time taken to serve the request, in microseconds.
-        '%D':'response_time_us',
-        # The contents of the environment variable FOOBAR
-        #'%{FOOBAR}e':'',
-        '%{}e':'env',
-        # Filename
-        '%f':'filename',
-        # Remote host
-        '%h':'remote_host',
-        # The request protocol
-        '%H':'request_protocol',
-        # The contents of Foobar: header line(s) in the request sent to
-        # the server. Changes made by other modules (e.g. mod_headers)
-        # affect this.
-        #'%{Foobar}i':'',
-        '%{}i':'header',
-        # Number of keepalive requests handled on this connection.
-        # Interesting if KeepAlive is being used, so that, for example,
-        # a "1" means the first keepalive request after the initial one,
-        # "2" the second, etc...; otherwise this is always 0 (indicating
-        # the initial request). Available in versions 2.2.11 and later.
-        '%k':'keepalive_num',
-        # Remote logname (from identd, if supplied). This will return a
-        # dash unless mod_ident is present and IdentityCheck is set On.
-        '%l':'remote_logname',
-        # The request method
-        '%m':'request_method',
-        # The contents of note Foobar from another module.
-        #'%{Foobar}n':'',
-        '%{}n':'note',
-        # The contents of Foobar: header line(s) in the reply.
-        #'%{Foobar}o':'',
-        '%{}o':'reply_header',
-        # The canonical port of the server serving the request
-        '%p':'server_port',
-        # The canonical port of the server serving the request or the
-        # server's actual port or the client's actual port. Valid
-        # formats are canonical, local, or remote.
-        #'%{format}p':"",
-        '%{}p':'port',
-        # The process ID of the child that serviced the request.
-        '%P':'process_id',
-        # The process ID or thread id of the child that serviced the
-        # request. Valid formats are pid, tid, and hextid. hextid requires
-        # APR 1.2.0 or higher.
-        #'%{format}P':'',
-        '%{}P':'pid',
-        # The query string (prepended with a ? if a query string exists,
-        # otherwise an empty string)
-        '%q':'query_string',
-        # First line of request
-        # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
-        '%r':'first_line',
-        # The handler generating the response (if any).
-        '%R':'response_handler',
-        # Status. For requests that got internally redirected, this is
-        # the status of the *original* request --- %>s for the last.
-        '%s':'status',
-        '%>s':'last_status',
-        # Time the request was received (standard english format)
-        '%t':'time',
-        # The time, in the form given by format, which should be in
-        # strftime(3) format. (potentially localized)
-        #'%{format}t':'TODO',
-        # The time taken to serve the request, in seconds.
-        '%T':'response_time_sec',
-        # Remote user (from auth; may be bogus if return status (%s) is 401)
-        '%u':'remote_user',
-        # The URL path requested, not including any query string.
-        '%U':'url_path',
-        # The canonical ServerName of the server serving the request.
-        '%v':'canonical_server_name',
-        # The server name according to the UseCanonicalName setting.
-        '%V':'server_name_config', #TODO: Needs better name
-        # Connection status when response is completed:
-        # X = connection aborted before the response completed.
-        # + = connection may be kept alive after the response is sent.
-        # - = connection will be closed after the response is sent.
-        '%X':'completed_connection_status',
-        # Bytes received, including request and headers, cannot be zero.
-        # You need to enable mod_logio to use this.
-        '%I':'bytes_received',
-        # Bytes sent, including headers, cannot be zero. You need to
-        # enable mod_logio to use this
-        '%O':'bytes_sent',
-    }
-
-    def __init__(self, format, use_friendly_names=False):
-        """
-        Takes the log format from an Apache configuration file.
-
-        Best just copy and paste directly from the .conf file
-        and pass using a Python raw string e.g.
-
-        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
-        p = apachelog.parser(format)
-        """
-        self._names = []
-        self._regex = None
-        self._pattern = ''
-        self._use_friendly_names = use_friendly_names
-        self._parse_format(format)
-
-    def _parse_format(self, format):
-        """
-        Converts the input format to a regular
-        expression, as well as extracting fields
-
-        Raises an exception if it couldn't compile
-        the generated regex.
-        """
-        format = format.strip()
-        format = re.sub('[ \t]+',' ',format)
-
-        subpatterns = []
-
-        findquotes = re.compile(r'^\\"')
-        findreferreragent = re.compile('Referer|User-Agent', re.I)
-        findpercent = re.compile('^%.*t$')
-        lstripquotes = re.compile(r'^\\"')
-        rstripquotes = re.compile(r'\\"$')
-        self._names = []
-
-        for element in format.split(' '):
-
-            hasquotes = 0
-            if findquotes.search(element): hasquotes = 1
-
-            if hasquotes:
-                element = lstripquotes.sub('', element)
-                element = rstripquotes.sub('', element)
-
-            if self._use_friendly_names:
-                self._names.append(self.alias(element))
-            else:
-                self._names.append(element)
-
-            subpattern = '(\S*)'
-
-            if hasquotes:
-                if element == '%r' or findreferreragent.search(element):
-                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
-                else:
-                    subpattern = r'\"([^\"]*)\"'
-
-            elif findpercent.search(element):
-                subpattern = r'(\[[^\]]+\])'
-
-            elif element == '%U':
-                subpattern = '(.+?)'
-
-            subpatterns.append(subpattern)
-
-        self._pattern = '^' + ' '.join(subpatterns) + '$'
-        try:
-            self._regex = re.compile(self._pattern)
-        except Exception, e:
-            raise ApacheLogParserError(e)
-
-    def parse(self, line):
-        """
-        Parses a single line from the log file and returns
-        a dictionary of it's contents.
-
-        Raises and exception if it couldn't parse the line
-        """
-        line = line.strip()
-        match = self._regex.match(line)
-
-        if match:
-            data = AttrDict()
-            for k, v in zip(self._names, match.groups()):
-                data[k] = v
-            return data
-
-        raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
-
-    def alias(self, name):
-        """
-        Override / replace this method if you want to map format
-        field names to something else. This method is called
-        when the parser is constructed, not when actually parsing
-        a log file
-
-        For custom format names, such as %{Foobar}C, 'Foobar' is referred to
-        (in this function) as the custom_format and '%{}C' as the name
-
-        If the custom_format has a '-' in it (and is not a time format), then the
-        '-' is replaced with a '_' so the name remains a valid identifier.
-
-        Takes and returns a string fieldname
-        """
-
-        custom_format = ''
-
-        if name.startswith('%{'):
-            custom_format = '_' + name[2:-2]
-            name = '%{}' + name[-1]
-
-            if name != '%{}t':
-                custom_format = custom_format.replace('-', '_')
-
-        try:
-            return self.format_to_name[name] + custom_format
-        except KeyError:
-            return name
-
-    def pattern(self):
-        """
-        Returns the compound regular expression the parser extracted
-        from the input format (a string)
-        """
-        return self._pattern
-
-    def names(self):
-        """
-        Returns the field names the parser extracted from the
-        input format (a list)
-        """
-        return self._names
-
-months = {
-    'Jan':'01',
-    'Feb':'02',
-    'Mar':'03',
-    'Apr':'04',
-    'May':'05',
-    'Jun':'06',
-    'Jul':'07',
-    'Aug':'08',
-    'Sep':'09',
-    'Oct':'10',
-    'Nov':'11',
-    'Dec':'12'
-    }
-
-def parse_date(date):
-    """
-    Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
-    (including square brackets) and returns a two element
-    tuple containing first a timestamp of the form
-    YYYYMMDDHH24IISS e.g. 20061205105144 and second the
-    timezone offset as is e.g.;
-
-    parse_date('[05/Dec/2006:10:51:44 +0000]')
-    >> ('20061205105144', '+0000')
-
-    It does not attempt to adjust the timestamp according
-    to the timezone - this is your problem.
-    """
-    date = date[1:-1]
-    elems = [
-        date[7:11],
-        months[date[3:6]],
-        date[0:2],
-        date[12:14],
-        date[15:17],
-        date[18:20],
-        ]
-    return (''.join(elems),date[21:])
-
-
-"""
-Frequenty used log formats stored here
-"""
-formats = {
-    # Common Log Format (CLF)
-    'common':r'%h %l %u %t \"%r\" %>s %b',
-
-    # Common Log Format with Virtual Host
-    'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
-
-    # NCSA extended/combined log format
-    'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
-    }
-
-if __name__ == '__main__':
-    import unittest
-
-    class TestApacheLogParser(unittest.TestCase):
-
-        def setUp(self):
-            self.format = r'%h %l %u %t \"%r\" %>s '\
-                          r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
-            self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
-                          '%{User-Agent}i'.split(' ')
-            self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
-                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
-                           '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
-                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
-            self.line1  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
-                          r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
-                          r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
-                          r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
-                          r'Gecko/20021202"'
-            self.line2  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
-                          r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
-                          r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
-                          r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
-                          r'Gecko/20021202"'
-            self.line3  = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
-                          r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
-                          r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
-                          r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
-                          r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
-                          r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
-                          r'YPC 3.0.3; yplus 4.0.00d)\""'
-#                          r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
-#                          r'YPC 3.0.3; yplus 4.0.00d)"'
-            self.p = parser(self.format)
-
-        def testpattern(self):
-            self.assertEqual(self.pattern, self.p.pattern())
-
-        def testnames(self):
-            self.assertEqual(self.fields, self.p.names())
-
-        def testline1(self):
-            data = self.p.parse(self.line1)
-            self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
-            self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
-            self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
-            self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
-            self.assertEqual(
-                data['%r'],
-                'GET /images/previous.png HTTP/1.1',
-                msg = 'Line 1 %r'
-                )
-            self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
-            self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
-            self.assertEqual(
-                data['%{Referer}i'],
-                'http://peterhi.dyndns.org/bandwidth/index.html',
-                msg = 'Line 1 %{Referer}i'
-                )
-            self.assertEqual(
-                data['%{User-Agent}i'],
-                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
-                msg = 'Line 1 %{User-Agent}i'
-                )
-
-
-        def testline2(self):
-            data = self.p.parse(self.line2)
-            self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
-            self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
-            self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
-            self.assertEqual(
-                data['%t'],
-                '[23/Jan/2004:11:36:20 +0000]',
-                msg = 'Line 2 %t'
-                )
-            self.assertEqual(
-                data['%r'],
-                r'GET /images/previous.png=\" HTTP/1.1',
-                msg = 'Line 2 %r'
-                )
-            self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
-            self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
-            self.assertEqual(
-                data['%{Referer}i'],
-                'http://peterhi.dyndns.org/bandwidth/index.html',
-                msg = 'Line 2 %{Referer}i'
-                )
-            self.assertEqual(
-                data['%{User-Agent}i'],
-                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
-                msg = 'Line 2 %{User-Agent}i'
-                )
-
-        def testline3(self):
-            data = self.p.parse(self.line3)
-            self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
-            self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
-            self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
-            self.assertEqual(
-                data['%t'],
-                '[20/Jul/2004:13:18:55 -0700]',
-                msg = 'Line 3 %t'
-                )
-            self.assertEqual(
-                data['%r'],
-                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
-                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
-                r'HTTP/1.1',
-                msg = 'Line 3 %r'
-                )
-            self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
-            self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
-            self.assertEqual(
-                data['%{Referer}i'],
-                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
-                r'%20bimini\"',
-                msg = 'Line 3 %{Referer}i'
-                )
-            self.assertEqual(
-                data['%{User-Agent}i'],
-                '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
-                'yplus 4.0.00d)\\"',
-#                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
-#                'yplus 4.0.00d)',
-                msg = 'Line 3 %{User-Agent}i'
-                )
-
-
-        def testjunkline(self):
-            self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
-
-        def testhasquotesaltn(self):
-            p = parser(r'%a \"%b\" %c')
-            line = r'foo "xyz" bar'
-            data = p.parse(line)
-            self.assertEqual(data['%a'],'foo', '%a')
-            self.assertEqual(data['%b'],'xyz', '%c')
-            self.assertEqual(data['%c'],'bar', '%c')
-
-        def testparsedate(self):
-            date = '[05/Dec/2006:10:51:44 +0000]'
-            self.assertEqual(('20061205105144','+0000'),parse_date(date))
-
-    class TestApacheLogParserFriendlyNames(unittest.TestCase):
-
-        def setUp(self):
-            self.format = r'%h %l %u %t \"%r\" %>s '\
-                          r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
-            self.fields = ('remote_host remote_logname remote_user time '
-                           'first_line last_status response_bytes_clf '
-                           'header_Referer header_User_Agent').split(' ')
-            self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
-                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
-                           '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
-                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
-            self.line1  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
-                          r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
-                          r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
-                          r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
-                          r'Gecko/20021202"'
-            self.line2  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
-                          r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
-                          r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
-                          r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
-                          r'Gecko/20021202"'
-            self.line3  = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
-                          r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
-                          r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
-                          r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
-                          r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
-                          r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
-                          r'YPC 3.0.3; yplus 4.0.00d)\""'
-#                          r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
-#                          r'YPC 3.0.3; yplus 4.0.00d)"'
-            self.p = parser(self.format, True)
-
-        def testpattern(self):
-            self.assertEqual(self.pattern, self.p.pattern())
-
-        def testnames(self):
-            self.assertEqual(self.fields, self.p.names())
-
-        def testline1(self):
-            data = self.p.parse(self.line1)
-            self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host')
-            self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname')
-            self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user')
-            self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time')
-            self.assertEqual(
-                data.first_line,
-                'GET /images/previous.png HTTP/1.1',
-                msg = 'Line 1 first_line'
-                )
-            self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status')
-            self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf')
-            self.assertEqual(
-                data.header_Referer,
-                'http://peterhi.dyndns.org/bandwidth/index.html',
-                msg = 'Line 1 %{Referer}i'
-                )
-            self.assertEqual(
-                data.header_User_Agent,
-                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
-                msg = 'Line 1 %{User-Agent}i'
-                )
-
-
-        def testline2(self):
-            data = self.p.parse(self.line2)
-            self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host')
-            self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname')
-            self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user')
-            self.assertEqual(
-                data.time,
-                '[23/Jan/2004:11:36:20 +0000]',
-                msg = 'Line 2 time'
-                )
-            self.assertEqual(
-                data.first_line,
-                r'GET /images/previous.png=\" HTTP/1.1',
-                msg = 'Line 2 first_line'
-                )
-            self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status')
-            self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf')
-            self.assertEqual(
-                data.header_Referer,
-                'http://peterhi.dyndns.org/bandwidth/index.html',
-                msg = 'Line 2 %{Referer}i'
-                )
-            self.assertEqual(
-                data.header_User_Agent,
-                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
-                msg = 'Line 2 %{User-Agent}i'
-                )
-
-        def testline3(self):
-            data = self.p.parse(self.line3)
-            self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host')
-            self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname')
-            self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user')
-            self.assertEqual(
-                data.time,
-                '[20/Jul/2004:13:18:55 -0700]',
-                msg = 'Line 3 time'
-                )
-            self.assertEqual(
-                data.first_line,
-                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
-                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
-                r'HTTP/1.1',
-                msg = 'Line 3 first_line'
-                )
-            self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status')
-            self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf')
-            self.assertEqual(
-                data.header_Referer,
-                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
-                r'%20bimini\"',
-                msg = 'Line 3 %{Referer}i'
-                )
-            self.assertEqual(
-                data.header_User_Agent,
-                '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
-                'yplus 4.0.00d)\\"',
-#                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
-#                'yplus 4.0.00d)',
-                msg = 'Line 3 %{User-Agent}i'
-                )
-
-
-        def testjunkline(self):
-            self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
-
-        def testhasquotesaltn(self):
-            p = parser(r'%a \"%b\" %c')
-            line = r'foo "xyz" bar'
-            data = p.parse(line)
-            self.assertEqual(data['%a'],'foo', '%a')
-            self.assertEqual(data['%b'],'xyz', '%c')
-            self.assertEqual(data['%c'],'bar', '%c')
-
-        def testparsedate(self):
-            date = '[05/Dec/2006:10:51:44 +0000]'
-            self.assertEqual(('20061205105144','+0000'),parse_date(date))
-
-
-    unittest.main()
diff --git a/apachelog/__init__.py b/apachelog/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/apachelog/parser.py b/apachelog/parser.py
new file mode 100644 (file)
index 0000000..925b128
--- /dev/null
@@ -0,0 +1,401 @@
+"""Apache Log Parser
+
+Parser for Apache log files. This is a port to python of Peter Hickman's
+Apache::LogEntry Perl module:
+<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
+
+Takes the Apache logging format defined in your httpd.conf and generates
+a regular expression which is used to parse a line from the log file and
+return it as a dictionary with keys corresponding to the fields defined
+in the log format.
+
+Example:
+
+    import apachelog, sys
+
+    # Format copied and pasted from Apache conf - use raw string + single quotes
+    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
+
+    p = apachelog.parser(format)
+
+    for line in open('/var/apache/access.log'):
+        try:
+           data = p.parse(line)
+        except:
+           sys.stderr.write("Unable to parse %s" % line)
+
+The return dictionary from the parse method depends on the input format.
+For the above example, the returned dictionary would look like;
+
+    {
+    '%>s': '200',
+    '%b': '2607',
+    '%h': '212.74.15.68',
+    '%l': '-',
+    '%r': 'GET /images/previous.png HTTP/1.1',
+    '%t': '[23/Jan/2004:11:36:20 +0000]',
+    '%u': '-',
+    '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
+    '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
+    }
+
+...given an access log entry like (split across lines for formatting);
+
+    212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
+        200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
+        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
+
+You can also re-map the field names by subclassing (or re-pointing) the
+alias method.
+
+Generally you should be able to copy and paste the format string from
+your Apache configuration, but remember to place it in a raw string
+using single-quotes, so that backslashes are handled correctly.
+
+This module provides three of the most common log formats in the
+formats dictionary;
+
+    # Common Log Format (CLF)
+    p = apachelog.parser(apachelog.formats['common'])
+
+    # Common Log Format with Virtual Host
+    p = apachelog.parser(apachelog.formats['vhcommon'])
+
+    # NCSA extended/combined log format
+    p = apachelog.parser(apachelog.formats['extended'])
+
+For notes regarding performance while reading lines from a file
+in Python, see <http://effbot.org/zone/readline-performance.htm>.
+Further performance boost can be gained by using psyco
+<http://psyco.sourceforge.net/>
+
+On my system, using a loop like;
+
+    for line in open('access.log'):
+        p.parse(line)
+
+...was able to parse ~60,000 lines / second. Adding psyco to the mix
+upped that to ~75,000 lines / second.
+
+The parse_date function is intended as a fast way to convert a log
+date into something useful, without incurring a significant date
+parsing overhead - good enough for basic stuff but will be a problem
+if you need to deal with logs from multiple servers in different
+timezones.
+"""
+
+__version__ = "1.1"
+__license__ = """Released under the same terms as Perl.
+See: http://dev.perl.org/licenses/
+"""
+__author__ = "Harry Fuecks <hfuecks@gmail.com>"
+__contributors__ = [
+    "Peter Hickman <peterhi@ntlworld.com>",
+    "Loic Dachary <loic@dachary.org>"
+    ]
+
+import re
+
+class ApacheLogParserError(Exception):
+    pass
+
+class AttrDict(dict):
+    """Dict whose items can also be read as attributes, e.g. data.time."""
+    def __getattr__(self, name):
+        try:
+            return self[name]
+        except KeyError:  # missing attribute must not leak as KeyError
+            raise AttributeError(name)
+
+class parser:
+    format_to_name = {
+        # Explanatory comments copied from
+        # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
+        # Remote IP-address
+        '%a':'remote_ip',
+        # Local IP-address
+        '%A':'local_ip',
+        # Size of response in bytes, excluding HTTP headers.
+        '%B':'response_bytes',
+        # Size of response in bytes, excluding HTTP headers. In CLF
+        # format, i.e. a "-" rather than a 0 when no bytes are sent.
+        '%b':'response_bytes_clf',
+        # The contents of cookie Foobar in the request sent to the server.
+        # Only version 0 cookies are fully supported.
+        #'%{Foobar}C':'',
+        '%{}C':'cookie',
+        # The time taken to serve the request, in microseconds.
+        '%D':'response_time_us',
+        # The contents of the environment variable FOOBAR
+        #'%{FOOBAR}e':'',
+        '%{}e':'env',
+        # Filename
+        '%f':'filename',
+        # Remote host
+        '%h':'remote_host',
+        # The request protocol
+        '%H':'request_protocol',
+        # The contents of Foobar: header line(s) in the request sent to
+        # the server. Changes made by other modules (e.g. mod_headers)
+        # affect this.
+        #'%{Foobar}i':'',
+        '%{}i':'header',
+        # Number of keepalive requests handled on this connection.
+        # Interesting if KeepAlive is being used, so that, for example,
+        # a "1" means the first keepalive request after the initial one,
+        # "2" the second, etc...; otherwise this is always 0 (indicating
+        # the initial request). Available in versions 2.2.11 and later.
+        '%k':'keepalive_num',
+        # Remote logname (from identd, if supplied). This will return a
+        # dash unless mod_ident is present and IdentityCheck is set On.
+        '%l':'remote_logname',
+        # The request method
+        '%m':'request_method',
+        # The contents of note Foobar from another module.
+        #'%{Foobar}n':'',
+        '%{}n':'note',
+        # The contents of Foobar: header line(s) in the reply.
+        #'%{Foobar}o':'',
+        '%{}o':'reply_header',
+        # The canonical port of the server serving the request
+        '%p':'server_port',
+        # The canonical port of the server serving the request or the
+        # server's actual port or the client's actual port. Valid
+        # formats are canonical, local, or remote.
+        #'%{format}p':"",
+        '%{}p':'port',
+        # The process ID of the child that serviced the request.
+        '%P':'process_id',
+        # The process ID or thread id of the child that serviced the
+        # request. Valid formats are pid, tid, and hextid. hextid requires
+        # APR 1.2.0 or higher.
+        #'%{format}P':'',
+        '%{}P':'pid',
+        # The query string (prepended with a ? if a query string exists,
+        # otherwise an empty string)
+        '%q':'query_string',
+        # First line of request
+        # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
+        '%r':'first_line',
+        # The handler generating the response (if any).
+        '%R':'response_handler',
+        # Status. For requests that got internally redirected, this is
+        # the status of the *original* request --- %>s for the last.
+        '%s':'status',
+        '%>s':'last_status',
+        # Time the request was received (standard english format)
+        '%t':'time',
+        # The time, in the form given by format, which should be in
+        # strftime(3) format. (potentially localized)
+        #'%{format}t':'TODO',
+        # The time taken to serve the request, in seconds.
+        '%T':'response_time_sec',
+        # Remote user (from auth; may be bogus if return status (%s) is 401)
+        '%u':'remote_user',
+        # The URL path requested, not including any query string.
+        '%U':'url_path',
+        # The canonical ServerName of the server serving the request.
+        '%v':'canonical_server_name',
+        # The server name according to the UseCanonicalName setting.
+        '%V':'server_name_config', #TODO: Needs better name
+        # Connection status when response is completed:
+        # X = connection aborted before the response completed.
+        # + = connection may be kept alive after the response is sent.
+        # - = connection will be closed after the response is sent.
+        '%X':'completed_connection_status',
+        # Bytes received, including request and headers, cannot be zero.
+        # You need to enable mod_logio to use this.
+        '%I':'bytes_received',
+        # Bytes sent, including headers, cannot be zero. You need to
+        # enable mod_logio to use this
+        '%O':'bytes_sent',
+    }
+
+    def __init__(self, format, use_friendly_names=False):
+        """
+        Takes the log format from an Apache configuration file.
+
+        Best just copy and paste directly from the .conf file
+        and pass using a Python raw string e.g.
+
+        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
+        p = apachelog.parser(format)
+        """
+        self._names = []
+        self._regex = None
+        self._pattern = ''
+        self._use_friendly_names = use_friendly_names
+        self._parse_format(format)
+
+    def _parse_format(self, format):
+        """
+        Converts the input format to a regular expression and records
+        the field names (aliased when friendly names are enabled).
+
+        Raises ApacheLogParserError if the generated regex
+        does not compile.
+        """
+        format = format.strip()
+        format = re.sub('[ \t]+',' ',format)
+
+        subpatterns = []
+        # A quoted field appears in the format as \"%x\" (backslash-quote).
+        findquotes = re.compile(r'^\\"')
+        findreferreragent = re.compile('Referer|User-Agent', re.I)
+        findpercent = re.compile('^%.*t$')
+        lstripquotes = re.compile(r'^\\"')
+        rstripquotes = re.compile(r'\\"$')
+        self._names = []
+
+        for element in format.split(' '):
+
+            hasquotes = 0
+            if findquotes.search(element): hasquotes = 1
+
+            if hasquotes:
+                element = lstripquotes.sub('', element)
+                element = rstripquotes.sub('', element)
+
+            if self._use_friendly_names:
+                self._names.append(self.alias(element))
+            else:
+                self._names.append(element)
+            # Default: a (possibly empty) run of non-whitespace characters.
+            subpattern = r'(\S*)'
+
+            if hasquotes:
+                if element == '%r' or findreferreragent.search(element):
+                    # Allow backslash-escaped characters inside the quotes.
+                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
+                else:
+                    subpattern = r'\"([^\"]*)\"'
+            elif findpercent.search(element):
+                # %...t elements are matched as one [bracketed] group.
+                subpattern = r'(\[[^\]]+\])'
+            elif element == '%U':
+                subpattern = '(.+?)'
+
+            subpatterns.append(subpattern)
+
+        self._pattern = '^' + ' '.join(subpatterns) + '$'
+        try:
+            self._regex = re.compile(self._pattern)
+        except Exception as e:
+            raise ApacheLogParserError(e)
+
+    def parse(self, line):
+        """
+        Parses a single line from the log file and returns
+        a dictionary of its contents.
+
+        Raises an exception if it couldn't parse the line
+        """
+        line = line.strip()
+        match = self._regex.match(line)
+
+        if match:
+            data = AttrDict()
+            for k, v in zip(self._names, match.groups()):
+                data[k] = v
+            return data
+
+        raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
+
+    def alias(self, name):
+        """
+        Override / replace this method if you want to map format
+        field names to something else. This method is called
+        when the parser is constructed, not when actually parsing
+        a log file
+
+        For custom format names, such as %{Foobar}C, 'Foobar' is referred to
+        (in this function) as the custom_format and '%{}C' as the name
+
+        If the custom_format has a '-' in it (and is not a time format), then the
+        '-' is replaced with a '_' so the name remains a valid identifier.
+
+        Takes and returns a string fieldname
+        """
+
+        custom_format = ''
+
+        if name.startswith('%{'):
+            custom_format = '_' + name[2:-2]
+            name = '%{}' + name[-1]
+
+            if name != '%{}t':
+                custom_format = custom_format.replace('-', '_')
+
+        try:
+            return self.format_to_name[name] + custom_format
+        except KeyError:
+            return name
+
+    def pattern(self):
+        """
+        Returns the compound regular expression the parser extracted
+        from the input format (a string)
+        """
+        return self._pattern
+
+    def names(self):
+        """
+        Returns the field names the parser extracted from the
+        input format (a list)
+        """
+        return self._names
+
+months = {
+    'Jan':'01',
+    'Feb':'02',
+    'Mar':'03',
+    'Apr':'04',
+    'May':'05',
+    'Jun':'06',
+    'Jul':'07',
+    'Aug':'08',
+    'Sep':'09',
+    'Oct':'10',
+    'Nov':'11',
+    'Dec':'12'
+    }
+
+def parse_date(date):
+    """
+    Takes a date in the format: [05/Dec/2006:10:51:44 +0000]
+    (including square brackets) and returns a two element
+    tuple containing first a timestamp of the form
+    YYYYMMDDHH24IISS e.g. 20061205105144 and second the
+    timezone offset as is e.g.;
+
+    parse_date('[05/Dec/2006:10:51:44 +0000]')
+    >> ('20061205105144', '+0000')
+
+    It does not attempt to adjust the timestamp according
+    to the timezone - this is your problem.
+    """
+    date = date[1:-1]
+    elems = [
+        date[7:11],
+        months[date[3:6]],
+        date[0:2],
+        date[12:14],
+        date[15:17],
+        date[18:20],
+        ]
+    return (''.join(elems),date[21:])
+
+
+"""
+Frequently used log formats stored here
+"""
+formats = {
+    # Common Log Format (CLF)
+    'common':r'%h %l %u %t \"%r\" %>s %b',
+
+    # Common Log Format with Virtual Host
+    'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
+
+    # NCSA extended/combined log format
+    'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
+    }
diff --git a/apachelog/test/__init__.py b/apachelog/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/apachelog/test/test_parser.py b/apachelog/test/test_parser.py
new file mode 100644 (file)
index 0000000..2af9896
--- /dev/null
@@ -0,0 +1,291 @@
+import unittest
+
+from ..parser import ApacheLogParserError, parser, months, parse_date, formats
+
+
+class TestApacheLogParser(unittest.TestCase):
+
+    def setUp(self):
+        self.format = r'%h %l %u %t \"%r\" %>s '\
+                      r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
+        self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\
+                      '%{User-Agent}i'.split(' ')
+        self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
+                       '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
+                       '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
+                       '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
+        self.line1  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
+                      r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
+                      r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
+                      r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
+                      r'Gecko/20021202"'
+        self.line2  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
+                      r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
+                      r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
+                      r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
+                      r'Gecko/20021202"'
+        self.line3  = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
+                      r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
+                      r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
+                      r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
+                      r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
+                      r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
+                      r'YPC 3.0.3; yplus 4.0.00d)\""'
+#                          r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
+#                          r'YPC 3.0.3; yplus 4.0.00d)"'
+        self.p = parser(self.format)
+
+    def testpattern(self):
+        self.assertEqual(self.pattern, self.p.pattern())
+
+    def testnames(self):
+        self.assertEqual(self.fields, self.p.names())
+
+    def testline1(self):
+        data = self.p.parse(self.line1)
+        self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
+        self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
+        self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
+        self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
+        self.assertEqual(
+            data['%r'],
+            'GET /images/previous.png HTTP/1.1',
+            msg = 'Line 1 %r'
+            )
+        self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
+        self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
+        self.assertEqual(
+            data['%{Referer}i'],
+            'http://peterhi.dyndns.org/bandwidth/index.html',
+            msg = 'Line 1 %{Referer}i'
+            )
+        self.assertEqual(
+            data['%{User-Agent}i'],
+            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
+            msg = 'Line 1 %{User-Agent}i'
+            )
+
+
+    def testline2(self):
+        data = self.p.parse(self.line2)
+        self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
+        self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
+        self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
+        self.assertEqual(
+            data['%t'],
+            '[23/Jan/2004:11:36:20 +0000]',
+            msg = 'Line 2 %t'
+            )
+        self.assertEqual(
+            data['%r'],
+            r'GET /images/previous.png=\" HTTP/1.1',
+            msg = 'Line 2 %r'
+            )
+        self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
+        self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
+        self.assertEqual(
+            data['%{Referer}i'],
+            'http://peterhi.dyndns.org/bandwidth/index.html',
+            msg = 'Line 2 %{Referer}i'
+            )
+        self.assertEqual(
+            data['%{User-Agent}i'],
+            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
+            msg = 'Line 2 %{User-Agent}i'
+            )
+
+    def testline3(self):
+        data = self.p.parse(self.line3)
+        self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
+        self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
+        self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
+        self.assertEqual(
+            data['%t'],
+            '[20/Jul/2004:13:18:55 -0700]',
+            msg = 'Line 3 %t'
+            )
+        self.assertEqual(
+            data['%r'],
+            r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
+            r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
+            r'HTTP/1.1',
+            msg = 'Line 3 %r'
+            )
+        self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
+        self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
+        self.assertEqual(
+            data['%{Referer}i'],
+            r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
+            r'%20bimini\"',
+            msg = 'Line 3 %{Referer}i'
+            )
+        self.assertEqual(
+            data['%{User-Agent}i'],
+            '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
+            'yplus 4.0.00d)\\"',
+#                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
+#                'yplus 4.0.00d)',
+            msg = 'Line 3 %{User-Agent}i'
+            )
+
+
+    def testjunkline(self):
+        self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
+
+    def testhasquotesaltn(self):  # quoted field other than %r/Referer/User-Agent
+        p = parser(r'%a \"%b\" %c')
+        line = r'foo "xyz" bar'
+        data = p.parse(line)
+        self.assertEqual(data['%a'],'foo', '%a')
+        self.assertEqual(data['%b'],'xyz', '%b')
+        self.assertEqual(data['%c'],'bar', '%c')
+
+    def testparsedate(self):
+        date = '[05/Dec/2006:10:51:44 +0000]'
+        self.assertEqual(('20061205105144','+0000'),parse_date(date))
+
+class TestApacheLogParserFriendlyNames(unittest.TestCase):
+
+    def setUp(self):
+        self.format = r'%h %l %u %t \"%r\" %>s '\
+                      r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
+        self.fields = ('remote_host remote_logname remote_user time '
+                       'first_line last_status response_bytes_clf '
+                       'header_Referer header_User_Agent').split(' ')
+        self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
+                       '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
+                       '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
+                       '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
+        self.line1  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
+                      r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
+                      r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
+                      r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
+                      r'Gecko/20021202"'
+        self.line2  = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
+                      r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
+                      r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
+                      r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
+                      r'Gecko/20021202"'
+        self.line3  = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
+                      r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
+                      r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
+                      r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
+                      r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
+                      r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
+                      r'YPC 3.0.3; yplus 4.0.00d)\""'
+#                          r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
+#                          r'YPC 3.0.3; yplus 4.0.00d)"'
+        self.p = parser(self.format, True)
+
+    def testpattern(self):
+        self.assertEqual(self.pattern, self.p.pattern())
+
+    def testnames(self):
+        self.assertEqual(self.fields, self.p.names())
+
+    def testline1(self):
+        data = self.p.parse(self.line1)
+        self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host')
+        self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname')
+        self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user')
+        self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time')
+        self.assertEqual(
+            data.first_line,
+            'GET /images/previous.png HTTP/1.1',
+            msg = 'Line 1 first_line'
+            )
+        self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status')
+        self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf')
+        self.assertEqual(
+            data.header_Referer,
+            'http://peterhi.dyndns.org/bandwidth/index.html',
+            msg = 'Line 1 %{Referer}i'
+            )
+        self.assertEqual(
+            data.header_User_Agent,
+            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
+            msg = 'Line 1 %{User-Agent}i'
+            )
+
+
+    def testline2(self):
+        data = self.p.parse(self.line2)
+        self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host')
+        self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname')
+        self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user')
+        self.assertEqual(
+            data.time,
+            '[23/Jan/2004:11:36:20 +0000]',
+            msg = 'Line 2 time'
+            )
+        self.assertEqual(
+            data.first_line,
+            r'GET /images/previous.png=\" HTTP/1.1',
+            msg = 'Line 2 first_line'
+            )
+        self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status')
+        self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf')
+        self.assertEqual(
+            data.header_Referer,
+            'http://peterhi.dyndns.org/bandwidth/index.html',
+            msg = 'Line 2 %{Referer}i'
+            )
+        self.assertEqual(
+            data.header_User_Agent,
+            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
+            msg = 'Line 2 %{User-Agent}i'
+            )
+
+    def testline3(self):
+        data = self.p.parse(self.line3)
+        self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host')
+        self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname')
+        self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user')
+        self.assertEqual(
+            data.time,
+            '[20/Jul/2004:13:18:55 -0700]',
+            msg = 'Line 3 time'
+            )
+        self.assertEqual(
+            data.first_line,
+            r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
+            r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
+            r'HTTP/1.1',
+            msg = 'Line 3 first_line'
+            )
+        self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status')
+        self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf')
+        self.assertEqual(
+            data.header_Referer,
+            r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
+            r'%20bimini\"',
+            msg = 'Line 3 %{Referer}i'
+            )
+        self.assertEqual(
+            data.header_User_Agent,
+            '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
+            'yplus 4.0.00d)\\"',
+#                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
+#                'yplus 4.0.00d)',
+            msg = 'Line 3 %{User-Agent}i'
+            )
+
+
+    def testjunkline(self):
+        self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')
+
+    def testhasquotesaltn(self):  # quoted field other than %r/Referer/User-Agent
+        p = parser(r'%a \"%b\" %c')
+        line = r'foo "xyz" bar'
+        data = p.parse(line)
+        self.assertEqual(data['%a'],'foo', '%a')
+        self.assertEqual(data['%b'],'xyz', '%b')
+        self.assertEqual(data['%c'],'bar', '%c')
+
+    def testparsedate(self):
+        date = '[05/Dec/2006:10:51:44 +0000]'
+        self.assertEqual(('20061205105144','+0000'),parse_date(date))
+
+
+if __name__ == '__main__':
+    unittest.main()