From: W. Trevor King Date: Sat, 18 Feb 2012 17:00:09 +0000 (-0500) Subject: Restructure from single module into a package. X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=753a267e9c944b65241cdc33dba5bfe7963804e9;p=apachelog.git Restructure from single module into a package. I also split the tests out into their own submodule. You can run them with: nosetests apachelog Once we get some doctests involved, that will change to nosetests --with-doctest apachelog --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/apachelog.py b/apachelog.py deleted file mode 100755 index 5609ca5..0000000 --- a/apachelog.py +++ /dev/null @@ -1,691 +0,0 @@ -#!/usr/bin/env python -"""Apache Log Parser - -Parser for Apache log files. This is a port to python of Peter Hickman's -Apache::LogEntry Perl module: - - -Takes the Apache logging format defined in your httpd.conf and generates -a regular expression which is used to a line from the log file and -return it as a dictionary with keys corresponding to the fields defined -in the log format. - -Example: - - import apachelog, sys - - # Format copied and pasted from Apache conf - use raw string + single quotes - format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' - - p = apachelog.parser(format) - - for line in open('/var/apache/access.log'): - try: - data = p.parse(line) - except: - sys.stderr.write("Unable to parse %s" % line) - -The return dictionary from the parse method depends on the input format. -For the above example, the returned dictionary would look like; - - { - '%>s': '200', - '%b': '2607', - '%h': '212.74.15.68', - '%l': '-', - '%r': 'GET /images/previous.png HTTP/1.1', - '%t': '[23/Jan/2004:11:36:20 +0000]', - '%u': '-', - '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html', - '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202' - } - -...given an access log entry like (split across lines for formatting); - - 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1" - 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html" - "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202" - -You can also re-map the field names by subclassing (or re-pointing) the -alias method. - -Generally you should be able to copy and paste the format string from -your Apache configuration, but remember to place it in a raw string -using single-quotes, so that backslashes are handled correctly. - -This module provides three of the most common log formats in the -formats dictionary; - - # Common Log Format (CLF) - p = apachelog.parser(apachelog.formats['common']) - - # Common Log Format with Virtual Host - p = apachelog.parser(apachelog.formats['vhcommon']) - - # NCSA extended/combined log format - p = apachelog.parser(apachelog.formats['extended']) - -For notes regarding performance while reading lines from a file -in Python, see . -Further performance boost can be gained by using psyco - - -On my system, using a loop like; - - for line in open('access.log'): - p.parse(line) - -...was able to parse ~60,000 lines / second. Adding psyco to the mix, -up that to ~75,000 lines / second. - -The parse_date function is intended as a fast way to convert a log -date into something useful, without incurring a significant date -parsing overhead - good enough for basic stuff but will be a problem -if you need to deal with log from multiple servers in different -timezones. -""" - -__version__ = "1.1" -__license__ = """Released under the same terms as Perl. -See: http://dev.perl.org/licenses/ -""" -__author__ = "Harry Fuecks " -__contributors__ = [ - "Peter Hickman ", - "Loic Dachary " - ] - -import re - -class ApacheLogParserError(Exception): - pass - -class AttrDict(dict): - """ - Allows dicts to be accessed via dot notation as well as subscripts - Makes using the friendly names nicer - """ - def __getattr__(self, name): - return self[name] - -class parser: - format_to_name = { - # Explanatory comments copied from - # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html - # Remote IP-address - '%a':'remote_ip', - # Local IP-address - '%A':'local_ip', - # Size of response in bytes, excluding HTTP headers. - '%B':'response_bytes', - # Size of response in bytes, excluding HTTP headers. In CLF - # format, i.e. a "-" rather than a 0 when no bytes are sent. - '%b':'response_bytes_clf', - # The contents of cookie Foobar in the request sent to the server. - # Only version 0 cookies are fully supported. - #'%{Foobar}C':'', - '%{}C':'cookie', - # The time taken to serve the request, in microseconds. - '%D':'response_time_us', - # The contents of the environment variable FOOBAR - #'%{FOOBAR}e':'', - '%{}e':'env', - # Filename - '%f':'filename', - # Remote host - '%h':'remote_host', - # The request protocol - '%H':'request_protocol', - # The contents of Foobar: header line(s) in the request sent to - # the server. Changes made by other modules (e.g. mod_headers) - # affect this. - #'%{Foobar}i':'', - '%{}i':'header', - # Number of keepalive requests handled on this connection. - # Interesting if KeepAlive is being used, so that, for example, - # a "1" means the first keepalive request after the initial one, - # "2" the second, etc...; otherwise this is always 0 (indicating - # the initial request). Available in versions 2.2.11 and later. - '%k':'keepalive_num', - # Remote logname (from identd, if supplied). This will return a - # dash unless mod_ident is present and IdentityCheck is set On. - '%l':'remote_logname', - # The request method - '%m':'request_method', - # The contents of note Foobar from another module. - #'%{Foobar}n':'', - '%{}n':'note', - # The contents of Foobar: header line(s) in the reply. - #'%{Foobar}o':'', - '%{}o':'reply_header', - # The canonical port of the server serving the request - '%p':'server_port', - # The canonical port of the server serving the request or the - # server's actual port or the client's actual port. Valid - # formats are canonical, local, or remote. - #'%{format}p':"", - '%{}p':'port', - # The process ID of the child that serviced the request. - '%P':'process_id', - # The process ID or thread id of the child that serviced the - # request. Valid formats are pid, tid, and hextid. hextid requires - # APR 1.2.0 or higher. - #'%{format}P':'', - '%{}P':'pid', - # The query string (prepended with a ? if a query string exists, - # otherwise an empty string) - '%q':'query_string', - # First line of request - # e.g., what you'd see in the logs as 'GET / HTTP/1.1' - '%r':'first_line', - # The handler generating the response (if any). - '%R':'response_handler', - # Status. For requests that got internally redirected, this is - # the status of the *original* request --- %>s for the last. - '%s':'status', - '%>s':'last_status', - # Time the request was received (standard english format) - '%t':'time', - # The time, in the form given by format, which should be in - # strftime(3) format. (potentially localized) - #'%{format}t':'TODO', - # The time taken to serve the request, in seconds. - '%T':'response_time_sec', - # Remote user (from auth; may be bogus if return status (%s) is 401) - '%u':'remote_user', - # The URL path requested, not including any query string. - '%U':'url_path', - # The canonical ServerName of the server serving the request. - '%v':'canonical_server_name', - # The server name according to the UseCanonicalName setting. - '%V':'server_name_config', #TODO: Needs better name - # Connection status when response is completed: - # X = connection aborted before the response completed. - # + = connection may be kept alive after the response is sent. - # - = connection will be closed after the response is sent. - '%X':'completed_connection_status', - # Bytes received, including request and headers, cannot be zero. - # You need to enable mod_logio to use this. - '%I':'bytes_received', - # Bytes sent, including headers, cannot be zero. You need to - # enable mod_logio to use this - '%O':'bytes_sent', - } - - def __init__(self, format, use_friendly_names=False): - """ - Takes the log format from an Apache configuration file. - - Best just copy and paste directly from the .conf file - and pass using a Python raw string e.g. - - format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' - p = apachelog.parser(format) - """ - self._names = [] - self._regex = None - self._pattern = '' - self._use_friendly_names = use_friendly_names - self._parse_format(format) - - def _parse_format(self, format): - """ - Converts the input format to a regular - expression, as well as extracting fields - - Raises an exception if it couldn't compile - the generated regex. - """ - format = format.strip() - format = re.sub('[ \t]+',' ',format) - - subpatterns = [] - - findquotes = re.compile(r'^\\"') - findreferreragent = re.compile('Referer|User-Agent', re.I) - findpercent = re.compile('^%.*t$') - lstripquotes = re.compile(r'^\\"') - rstripquotes = re.compile(r'\\"$') - self._names = [] - - for element in format.split(' '): - - hasquotes = 0 - if findquotes.search(element): hasquotes = 1 - - if hasquotes: - element = lstripquotes.sub('', element) - element = rstripquotes.sub('', element) - - if self._use_friendly_names: - self._names.append(self.alias(element)) - else: - self._names.append(element) - - subpattern = '(\S*)' - - if hasquotes: - if element == '%r' or findreferreragent.search(element): - subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"' - else: - subpattern = r'\"([^\"]*)\"' - - elif findpercent.search(element): - subpattern = r'(\[[^\]]+\])' - - elif element == '%U': - subpattern = '(.+?)' - - subpatterns.append(subpattern) - - self._pattern = '^' + ' '.join(subpatterns) + '$' - try: - self._regex = re.compile(self._pattern) - except Exception, e: - raise ApacheLogParserError(e) - - def parse(self, line): - """ - Parses a single line from the log file and returns - a dictionary of it's contents. - - Raises and exception if it couldn't parse the line - """ - line = line.strip() - match = self._regex.match(line) - - if match: - data = AttrDict() - for k, v in zip(self._names, match.groups()): - data[k] = v - return data - - raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) ) - - def alias(self, name): - """ - Override / replace this method if you want to map format - field names to something else. This method is called - when the parser is constructed, not when actually parsing - a log file - - For custom format names, such as %{Foobar}C, 'Foobar' is referred to - (in this function) as the custom_format and '%{}C' as the name - - If the custom_format has a '-' in it (and is not a time format), then the - '-' is replaced with a '_' so the name remains a valid identifier. - - Takes and returns a string fieldname - """ - - custom_format = '' - - if name.startswith('%{'): - custom_format = '_' + name[2:-2] - name = '%{}' + name[-1] - - if name != '%{}t': - custom_format = custom_format.replace('-', '_') - - try: - return self.format_to_name[name] + custom_format - except KeyError: - return name - - def pattern(self): - """ - Returns the compound regular expression the parser extracted - from the input format (a string) - """ - return self._pattern - - def names(self): - """ - Returns the field names the parser extracted from the - input format (a list) - """ - return self._names - -months = { - 'Jan':'01', - 'Feb':'02', - 'Mar':'03', - 'Apr':'04', - 'May':'05', - 'Jun':'06', - 'Jul':'07', - 'Aug':'08', - 'Sep':'09', - 'Oct':'10', - 'Nov':'11', - 'Dec':'12' - } - -def parse_date(date): - """ - Takes a date in the format: [05/Dec/2006:10:51:44 +0000] - (including square brackets) and returns a two element - tuple containing first a timestamp of the form - YYYYMMDDHH24IISS e.g. 20061205105144 and second the - timezone offset as is e.g.; - - parse_date('[05/Dec/2006:10:51:44 +0000]') - >> ('20061205105144', '+0000') - - It does not attempt to adjust the timestamp according - to the timezone - this is your problem. - """ - date = date[1:-1] - elems = [ - date[7:11], - months[date[3:6]], - date[0:2], - date[12:14], - date[15:17], - date[18:20], - ] - return (''.join(elems),date[21:]) - - -""" -Frequenty used log formats stored here -""" -formats = { - # Common Log Format (CLF) - 'common':r'%h %l %u %t \"%r\" %>s %b', - - # Common Log Format with Virtual Host - 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b', - - # NCSA extended/combined log format - 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"', - } - -if __name__ == '__main__': - import unittest - - class TestApacheLogParser(unittest.TestCase): - - def setUp(self): - self.format = r'%h %l %u %t \"%r\" %>s '\ - r'%b \"%{Referer}i\" \"%{User-Agent}i\"' - self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\ - '%{User-Agent}i'.split(' ') - self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\ - '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ - '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ - '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$' - self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ - r'"GET /images/previous.png HTTP/1.1" 200 2607 '\ - r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ - r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ - r'Gecko/20021202"' - self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ - r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\ - r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ - r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ - r'Gecko/20021202"' - self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\ - r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\ - r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\ - r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\ - r'bin/search?p=\"grady%20white%20306%20bimini\"" '\ - r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ - r'YPC 3.0.3; yplus 4.0.00d)\""' -# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ -# r'YPC 3.0.3; yplus 4.0.00d)"' - self.p = parser(self.format) - - def testpattern(self): - self.assertEqual(self.pattern, self.p.pattern()) - - def testnames(self): - self.assertEqual(self.fields, self.p.names()) - - def testline1(self): - data = self.p.parse(self.line1) - self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h') - self.assertEqual(data['%l'], '-', msg = 'Line 1 %l') - self.assertEqual(data['%u'], '-', msg = 'Line 1 %u') - self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t') - self.assertEqual( - data['%r'], - 'GET /images/previous.png HTTP/1.1', - msg = 'Line 1 %r' - ) - self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s') - self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b') - self.assertEqual( - data['%{Referer}i'], - 'http://peterhi.dyndns.org/bandwidth/index.html', - msg = 'Line 1 %{Referer}i' - ) - self.assertEqual( - data['%{User-Agent}i'], - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', - msg = 'Line 1 %{User-Agent}i' - ) - - - def testline2(self): - data = self.p.parse(self.line2) - self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h') - self.assertEqual(data['%l'], '-', msg = 'Line 2 %l') - self.assertEqual(data['%u'], '-', msg = 'Line 2 %u') - self.assertEqual( - data['%t'], - '[23/Jan/2004:11:36:20 +0000]', - msg = 'Line 2 %t' - ) - self.assertEqual( - data['%r'], - r'GET /images/previous.png=\" HTTP/1.1', - msg = 'Line 2 %r' - ) - self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s') - self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b') - self.assertEqual( - data['%{Referer}i'], - 'http://peterhi.dyndns.org/bandwidth/index.html', - msg = 'Line 2 %{Referer}i' - ) - self.assertEqual( - data['%{User-Agent}i'], - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', - msg = 'Line 2 %{User-Agent}i' - ) - - def testline3(self): - data = self.p.parse(self.line3) - self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h') - self.assertEqual(data['%l'], '-', msg = 'Line 3 %l') - self.assertEqual(data['%u'], '-', msg = 'Line 3 %u') - self.assertEqual( - data['%t'], - '[20/Jul/2004:13:18:55 -0700]', - msg = 'Line 3 %t' - ) - self.assertEqual( - data['%r'], - r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\ - r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\ - r'HTTP/1.1', - msg = 'Line 3 %r' - ) - self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s') - self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b') - self.assertEqual( - data['%{Referer}i'], - r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\ - r'%20bimini\"', - msg = 'Line 3 %{Referer}i' - ) - self.assertEqual( - data['%{User-Agent}i'], - '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ - 'yplus 4.0.00d)\\"', -# 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ -# 'yplus 4.0.00d)', - msg = 'Line 3 %{User-Agent}i' - ) - - - def testjunkline(self): - self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') - - def testhasquotesaltn(self): - p = parser(r'%a \"%b\" %c') - line = r'foo "xyz" bar' - data = p.parse(line) - self.assertEqual(data['%a'],'foo', '%a') - self.assertEqual(data['%b'],'xyz', '%c') - self.assertEqual(data['%c'],'bar', '%c') - - def testparsedate(self): - date = '[05/Dec/2006:10:51:44 +0000]' - self.assertEqual(('20061205105144','+0000'),parse_date(date)) - - class TestApacheLogParserFriendlyNames(unittest.TestCase): - - def setUp(self): - self.format = r'%h %l %u %t \"%r\" %>s '\ - r'%b \"%{Referer}i\" \"%{User-Agent}i\"' - self.fields = ('remote_host remote_logname remote_user time ' - 'first_line last_status response_bytes_clf ' - 'header_Referer header_User_Agent').split(' ') - self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\ - '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ - '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ - '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$' - self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ - r'"GET /images/previous.png HTTP/1.1" 200 2607 '\ - r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ - r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ - r'Gecko/20021202"' - self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ - r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\ - r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ - r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ - r'Gecko/20021202"' - self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\ - r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\ - r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\ - r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\ - r'bin/search?p=\"grady%20white%20306%20bimini\"" '\ - r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ - r'YPC 3.0.3; yplus 4.0.00d)\""' -# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ -# r'YPC 3.0.3; yplus 4.0.00d)"' - self.p = parser(self.format, True) - - def testpattern(self): - self.assertEqual(self.pattern, self.p.pattern()) - - def testnames(self): - self.assertEqual(self.fields, self.p.names()) - - def testline1(self): - data = self.p.parse(self.line1) - self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host') - self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname') - self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user') - self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time') - self.assertEqual( - data.first_line, - 'GET /images/previous.png HTTP/1.1', - msg = 'Line 1 first_line' - ) - self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status') - self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf') - self.assertEqual( - data.header_Referer, - 'http://peterhi.dyndns.org/bandwidth/index.html', - msg = 'Line 1 %{Referer}i' - ) - self.assertEqual( - data.header_User_Agent, - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', - msg = 'Line 1 %{User-Agent}i' - ) - - - def testline2(self): - data = self.p.parse(self.line2) - self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host') - self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname') - self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user') - self.assertEqual( - data.time, - '[23/Jan/2004:11:36:20 +0000]', - msg = 'Line 2 time' - ) - self.assertEqual( - data.first_line, - r'GET /images/previous.png=\" HTTP/1.1', - msg = 'Line 2 first_line' - ) - self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status') - self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf') - self.assertEqual( - data.header_Referer, - 'http://peterhi.dyndns.org/bandwidth/index.html', - msg = 'Line 2 %{Referer}i' - ) - self.assertEqual( - data.header_User_Agent, - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', - msg = 'Line 2 %{User-Agent}i' - ) - - def testline3(self): - data = self.p.parse(self.line3) - self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host') - self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname') - self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user') - self.assertEqual( - data.time, - '[20/Jul/2004:13:18:55 -0700]', - msg = 'Line 3 time' - ) - self.assertEqual( - data.first_line, - r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\ - r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\ - r'HTTP/1.1', - msg = 'Line 3 first_line' - ) - self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status') - self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf') - self.assertEqual( - data.header_Referer, - r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\ - r'%20bimini\"', - msg = 'Line 3 %{Referer}i' - ) - self.assertEqual( - data.header_User_Agent, - '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ - 'yplus 4.0.00d)\\"', -# 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ -# 'yplus 4.0.00d)', - msg = 'Line 3 %{User-Agent}i' - ) - - - def testjunkline(self): - self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') - - def testhasquotesaltn(self): - p = parser(r'%a \"%b\" %c') - line = r'foo "xyz" bar' - data = p.parse(line) - self.assertEqual(data['%a'],'foo', '%a') - self.assertEqual(data['%b'],'xyz', '%c') - self.assertEqual(data['%c'],'bar', '%c') - - def testparsedate(self): - date = '[05/Dec/2006:10:51:44 +0000]' - self.assertEqual(('20061205105144','+0000'),parse_date(date)) - - - unittest.main() diff --git a/apachelog/__init__.py b/apachelog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apachelog/parser.py b/apachelog/parser.py new file mode 100644 index 0000000..925b128 --- /dev/null +++ b/apachelog/parser.py @@ -0,0 +1,401 @@ +"""Apache Log Parser + +Parser for Apache log files. This is a port to python of Peter Hickman's +Apache::LogEntry Perl module: + + +Takes the Apache logging format defined in your httpd.conf and generates +a regular expression which is used to a line from the log file and +return it as a dictionary with keys corresponding to the fields defined +in the log format. + +Example: + + import apachelog, sys + + # Format copied and pasted from Apache conf - use raw string + single quotes + format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' + + p = apachelog.parser(format) + + for line in open('/var/apache/access.log'): + try: + data = p.parse(line) + except: + sys.stderr.write("Unable to parse %s" % line) + +The return dictionary from the parse method depends on the input format. +For the above example, the returned dictionary would look like; + + { + '%>s': '200', + '%b': '2607', + '%h': '212.74.15.68', + '%l': '-', + '%r': 'GET /images/previous.png HTTP/1.1', + '%t': '[23/Jan/2004:11:36:20 +0000]', + '%u': '-', + '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html', + '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202' + } + +...given an access log entry like (split across lines for formatting); + + 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1" + 200 2607 "http://peterhi.dyndns.org/bandwidth/index.html" + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202" + +You can also re-map the field names by subclassing (or re-pointing) the +alias method. + +Generally you should be able to copy and paste the format string from +your Apache configuration, but remember to place it in a raw string +using single-quotes, so that backslashes are handled correctly. + +This module provides three of the most common log formats in the +formats dictionary; + + # Common Log Format (CLF) + p = apachelog.parser(apachelog.formats['common']) + + # Common Log Format with Virtual Host + p = apachelog.parser(apachelog.formats['vhcommon']) + + # NCSA extended/combined log format + p = apachelog.parser(apachelog.formats['extended']) + +For notes regarding performance while reading lines from a file +in Python, see . +Further performance boost can be gained by using psyco + + +On my system, using a loop like; + + for line in open('access.log'): + p.parse(line) + +...was able to parse ~60,000 lines / second. Adding psyco to the mix, +up that to ~75,000 lines / second. + +The parse_date function is intended as a fast way to convert a log +date into something useful, without incurring a significant date +parsing overhead - good enough for basic stuff but will be a problem +if you need to deal with log from multiple servers in different +timezones. +""" + +__version__ = "1.1" +__license__ = """Released under the same terms as Perl. +See: http://dev.perl.org/licenses/ +""" +__author__ = "Harry Fuecks " +__contributors__ = [ + "Peter Hickman ", + "Loic Dachary " + ] + +import re + +class ApacheLogParserError(Exception): + pass + +class AttrDict(dict): + """ + Allows dicts to be accessed via dot notation as well as subscripts + Makes using the friendly names nicer + """ + def __getattr__(self, name): + return self[name] + +class parser: + format_to_name = { + # Explanatory comments copied from + # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html + # Remote IP-address + '%a':'remote_ip', + # Local IP-address + '%A':'local_ip', + # Size of response in bytes, excluding HTTP headers. + '%B':'response_bytes', + # Size of response in bytes, excluding HTTP headers. In CLF + # format, i.e. a "-" rather than a 0 when no bytes are sent. + '%b':'response_bytes_clf', + # The contents of cookie Foobar in the request sent to the server. + # Only version 0 cookies are fully supported. + #'%{Foobar}C':'', + '%{}C':'cookie', + # The time taken to serve the request, in microseconds. + '%D':'response_time_us', + # The contents of the environment variable FOOBAR + #'%{FOOBAR}e':'', + '%{}e':'env', + # Filename + '%f':'filename', + # Remote host + '%h':'remote_host', + # The request protocol + '%H':'request_protocol', + # The contents of Foobar: header line(s) in the request sent to + # the server. Changes made by other modules (e.g. mod_headers) + # affect this. + #'%{Foobar}i':'', + '%{}i':'header', + # Number of keepalive requests handled on this connection. + # Interesting if KeepAlive is being used, so that, for example, + # a "1" means the first keepalive request after the initial one, + # "2" the second, etc...; otherwise this is always 0 (indicating + # the initial request). Available in versions 2.2.11 and later. + '%k':'keepalive_num', + # Remote logname (from identd, if supplied). This will return a + # dash unless mod_ident is present and IdentityCheck is set On. + '%l':'remote_logname', + # The request method + '%m':'request_method', + # The contents of note Foobar from another module. + #'%{Foobar}n':'', + '%{}n':'note', + # The contents of Foobar: header line(s) in the reply. + #'%{Foobar}o':'', + '%{}o':'reply_header', + # The canonical port of the server serving the request + '%p':'server_port', + # The canonical port of the server serving the request or the + # server's actual port or the client's actual port. Valid + # formats are canonical, local, or remote. + #'%{format}p':"", + '%{}p':'port', + # The process ID of the child that serviced the request. + '%P':'process_id', + # The process ID or thread id of the child that serviced the + # request. Valid formats are pid, tid, and hextid. hextid requires + # APR 1.2.0 or higher. + #'%{format}P':'', + '%{}P':'pid', + # The query string (prepended with a ? if a query string exists, + # otherwise an empty string) + '%q':'query_string', + # First line of request + # e.g., what you'd see in the logs as 'GET / HTTP/1.1' + '%r':'first_line', + # The handler generating the response (if any). + '%R':'response_handler', + # Status. For requests that got internally redirected, this is + # the status of the *original* request --- %>s for the last. + '%s':'status', + '%>s':'last_status', + # Time the request was received (standard english format) + '%t':'time', + # The time, in the form given by format, which should be in + # strftime(3) format. (potentially localized) + #'%{format}t':'TODO', + # The time taken to serve the request, in seconds. + '%T':'response_time_sec', + # Remote user (from auth; may be bogus if return status (%s) is 401) + '%u':'remote_user', + # The URL path requested, not including any query string. + '%U':'url_path', + # The canonical ServerName of the server serving the request. + '%v':'canonical_server_name', + # The server name according to the UseCanonicalName setting. + '%V':'server_name_config', #TODO: Needs better name + # Connection status when response is completed: + # X = connection aborted before the response completed. + # + = connection may be kept alive after the response is sent. + # - = connection will be closed after the response is sent. + '%X':'completed_connection_status', + # Bytes received, including request and headers, cannot be zero. + # You need to enable mod_logio to use this. + '%I':'bytes_received', + # Bytes sent, including headers, cannot be zero. You need to + # enable mod_logio to use this + '%O':'bytes_sent', + } + + def __init__(self, format, use_friendly_names=False): + """ + Takes the log format from an Apache configuration file. + + Best just copy and paste directly from the .conf file + and pass using a Python raw string e.g. + + format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' + p = apachelog.parser(format) + """ + self._names = [] + self._regex = None + self._pattern = '' + self._use_friendly_names = use_friendly_names + self._parse_format(format) + + def _parse_format(self, format): + """ + Converts the input format to a regular + expression, as well as extracting fields + + Raises an exception if it couldn't compile + the generated regex. + """ + format = format.strip() + format = re.sub('[ \t]+',' ',format) + + subpatterns = [] + + findquotes = re.compile(r'^\\"') + findreferreragent = re.compile('Referer|User-Agent', re.I) + findpercent = re.compile('^%.*t$') + lstripquotes = re.compile(r'^\\"') + rstripquotes = re.compile(r'\\"$') + self._names = [] + + for element in format.split(' '): + + hasquotes = 0 + if findquotes.search(element): hasquotes = 1 + + if hasquotes: + element = lstripquotes.sub('', element) + element = rstripquotes.sub('', element) + + if self._use_friendly_names: + self._names.append(self.alias(element)) + else: + self._names.append(element) + + subpattern = '(\S*)' + + if hasquotes: + if element == '%r' or findreferreragent.search(element): + subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"' + else: + subpattern = r'\"([^\"]*)\"' + + elif findpercent.search(element): + subpattern = r'(\[[^\]]+\])' + + elif element == '%U': + subpattern = '(.+?)' + + subpatterns.append(subpattern) + + self._pattern = '^' + ' '.join(subpatterns) + '$' + try: + self._regex = re.compile(self._pattern) + except Exception, e: + raise ApacheLogParserError(e) + + def parse(self, line): + """ + Parses a single line from the log file and returns + a dictionary of it's contents. + + Raises and exception if it couldn't parse the line + """ + line = line.strip() + match = self._regex.match(line) + + if match: + data = AttrDict() + for k, v in zip(self._names, match.groups()): + data[k] = v + return data + + raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) ) + + def alias(self, name): + """ + Override / replace this method if you want to map format + field names to something else. This method is called + when the parser is constructed, not when actually parsing + a log file + + For custom format names, such as %{Foobar}C, 'Foobar' is referred to + (in this function) as the custom_format and '%{}C' as the name + + If the custom_format has a '-' in it (and is not a time format), then the + '-' is replaced with a '_' so the name remains a valid identifier. + + Takes and returns a string fieldname + """ + + custom_format = '' + + if name.startswith('%{'): + custom_format = '_' + name[2:-2] + name = '%{}' + name[-1] + + if name != '%{}t': + custom_format = custom_format.replace('-', '_') + + try: + return self.format_to_name[name] + custom_format + except KeyError: + return name + + def pattern(self): + """ + Returns the compound regular expression the parser extracted + from the input format (a string) + """ + return self._pattern + + def names(self): + """ + Returns the field names the parser extracted from the + input format (a list) + """ + return self._names + +months = { + 'Jan':'01', + 'Feb':'02', + 'Mar':'03', + 'Apr':'04', + 'May':'05', + 'Jun':'06', + 'Jul':'07', + 'Aug':'08', + 'Sep':'09', + 'Oct':'10', + 'Nov':'11', + 'Dec':'12' + } + +def parse_date(date): + """ + Takes a date in the format: [05/Dec/2006:10:51:44 +0000] + (including square brackets) and returns a two element + tuple containing first a timestamp of the form + YYYYMMDDHH24IISS e.g. 20061205105144 and second the + timezone offset as is e.g.; + + parse_date('[05/Dec/2006:10:51:44 +0000]') + >> ('20061205105144', '+0000') + + It does not attempt to adjust the timestamp according + to the timezone - this is your problem. + """ + date = date[1:-1] + elems = [ + date[7:11], + months[date[3:6]], + date[0:2], + date[12:14], + date[15:17], + date[18:20], + ] + return (''.join(elems),date[21:]) + + +""" +Frequenty used log formats stored here +""" +formats = { + # Common Log Format (CLF) + 'common':r'%h %l %u %t \"%r\" %>s %b', + + # Common Log Format with Virtual Host + 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b', + + # NCSA extended/combined log format + 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"', + } diff --git a/apachelog/test/__init__.py b/apachelog/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apachelog/test/test_parser.py b/apachelog/test/test_parser.py new file mode 100644 index 0000000..2af9896 --- /dev/null +++ b/apachelog/test/test_parser.py @@ -0,0 +1,291 @@ +import unittest + +from ..parser import ApacheLogParserError, parser, months, parse_date, formats + + +class TestApacheLogParser(unittest.TestCase): + + def setUp(self): + self.format = r'%h %l %u %t \"%r\" %>s '\ + r'%b \"%{Referer}i\" \"%{User-Agent}i\"' + self.fields = '%h %l %u %t %r %>s %b %{Referer}i '\ + '%{User-Agent}i'.split(' ') + self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\ + '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ + '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ + '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$' + self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ + r'"GET /images/previous.png HTTP/1.1" 200 2607 '\ + r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ + r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ + r'Gecko/20021202"' + self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ + r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\ + r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ + r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ + r'Gecko/20021202"' + self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\ + r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\ + r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\ + r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\ + r'bin/search?p=\"grady%20white%20306%20bimini\"" '\ + r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ + r'YPC 3.0.3; yplus 4.0.00d)\""' +# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ +# r'YPC 3.0.3; yplus 4.0.00d)"' + self.p = parser(self.format) + + def testpattern(self): + self.assertEqual(self.pattern, self.p.pattern()) + + def testnames(self): + self.assertEqual(self.fields, self.p.names()) + + def testline1(self): + data = self.p.parse(self.line1) + self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h') + self.assertEqual(data['%l'], '-', msg = 'Line 1 %l') + self.assertEqual(data['%u'], '-', msg = 'Line 1 %u') + self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t') + self.assertEqual( + data['%r'], + 'GET /images/previous.png HTTP/1.1', + msg = 'Line 1 %r' + ) + self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s') + self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b') + self.assertEqual( + data['%{Referer}i'], + 'http://peterhi.dyndns.org/bandwidth/index.html', + msg = 'Line 1 %{Referer}i' + ) + self.assertEqual( + data['%{User-Agent}i'], + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', + msg = 'Line 1 %{User-Agent}i' + ) + + + def testline2(self): + data = self.p.parse(self.line2) + self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h') + self.assertEqual(data['%l'], '-', msg = 'Line 2 %l') + self.assertEqual(data['%u'], '-', msg = 'Line 2 %u') + self.assertEqual( + data['%t'], + '[23/Jan/2004:11:36:20 +0000]', + msg = 'Line 2 %t' + ) + self.assertEqual( + data['%r'], + r'GET /images/previous.png=\" HTTP/1.1', + msg = 'Line 2 %r' + ) + self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s') + self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b') + self.assertEqual( + data['%{Referer}i'], + 'http://peterhi.dyndns.org/bandwidth/index.html', + msg = 'Line 2 %{Referer}i' + ) + self.assertEqual( + data['%{User-Agent}i'], + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', + msg = 'Line 2 %{User-Agent}i' + ) + + def testline3(self): + data = self.p.parse(self.line3) + self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h') + self.assertEqual(data['%l'], '-', msg = 'Line 3 %l') + self.assertEqual(data['%u'], '-', msg = 'Line 3 %u') + self.assertEqual( + data['%t'], + '[20/Jul/2004:13:18:55 -0700]', + msg = 'Line 3 %t' + ) + self.assertEqual( + data['%r'], + r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\ + r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\ + r'HTTP/1.1', + msg = 'Line 3 %r' + ) + self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s') + self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b') + self.assertEqual( + data['%{Referer}i'], + r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\ + r'%20bimini\"', + msg = 'Line 3 %{Referer}i' + ) + self.assertEqual( + data['%{User-Agent}i'], + '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ + 'yplus 4.0.00d)\\"', +# 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ +# 'yplus 4.0.00d)', + msg = 'Line 3 %{User-Agent}i' + ) + + + def testjunkline(self): + self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') + + def testhasquotesaltn(self): + p = parser(r'%a \"%b\" %c') + line = r'foo "xyz" bar' + data = p.parse(line) + self.assertEqual(data['%a'],'foo', '%a') + self.assertEqual(data['%b'],'xyz', '%c') + self.assertEqual(data['%c'],'bar', '%c') + + def testparsedate(self): + date = '[05/Dec/2006:10:51:44 +0000]' + self.assertEqual(('20061205105144','+0000'),parse_date(date)) + +class TestApacheLogParserFriendlyNames(unittest.TestCase): + + def setUp(self): + self.format = r'%h %l %u %t \"%r\" %>s '\ + r'%b \"%{Referer}i\" \"%{User-Agent}i\"' + self.fields = ('remote_host remote_logname remote_user time ' + 'first_line last_status response_bytes_clf ' + 'header_Referer header_User_Agent').split(' ') + self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\ + '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ + '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\ + '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$' + self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ + r'"GET /images/previous.png HTTP/1.1" 200 2607 '\ + r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ + r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ + r'Gecko/20021202"' + self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\ + r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\ + r'"http://peterhi.dyndns.org/bandwidth/index.html" '\ + r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\ + r'Gecko/20021202"' + self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\ + r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\ + r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\ + r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\ + r'bin/search?p=\"grady%20white%20306%20bimini\"" '\ + r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ + r'YPC 3.0.3; yplus 4.0.00d)\""' +# r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\ +# r'YPC 3.0.3; yplus 4.0.00d)"' + self.p = parser(self.format, True) + + def testpattern(self): + self.assertEqual(self.pattern, self.p.pattern()) + + def testnames(self): + self.assertEqual(self.fields, self.p.names()) + + def testline1(self): + data = self.p.parse(self.line1) + self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host') + self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname') + self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user') + self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time') + self.assertEqual( + data.first_line, + 'GET /images/previous.png HTTP/1.1', + msg = 'Line 1 first_line' + ) + self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status') + self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf') + self.assertEqual( + data.header_Referer, + 'http://peterhi.dyndns.org/bandwidth/index.html', + msg = 'Line 1 %{Referer}i' + ) + self.assertEqual( + data.header_User_Agent, + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', + msg = 'Line 1 %{User-Agent}i' + ) + + + def testline2(self): + data = self.p.parse(self.line2) + self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host') + self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname') + self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user') + self.assertEqual( + data.time, + '[23/Jan/2004:11:36:20 +0000]', + msg = 'Line 2 time' + ) + self.assertEqual( + data.first_line, + r'GET /images/previous.png=\" HTTP/1.1', + msg = 'Line 2 first_line' + ) + self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status') + self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf') + self.assertEqual( + data.header_Referer, + 'http://peterhi.dyndns.org/bandwidth/index.html', + msg = 'Line 2 %{Referer}i' + ) + self.assertEqual( + data.header_User_Agent, + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202', + msg = 'Line 2 %{User-Agent}i' + ) + + def testline3(self): + data = self.p.parse(self.line3) + self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host') + self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname') + self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user') + self.assertEqual( + data.time, + '[20/Jul/2004:13:18:55 -0700]', + msg = 'Line 3 time' + ) + self.assertEqual( + data.first_line, + r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\ + r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\ + r'HTTP/1.1', + msg = 'Line 3 first_line' + ) + self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status') + self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf') + self.assertEqual( + data.header_Referer, + r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\ + r'%20bimini\"', + msg = 'Line 3 %{Referer}i' + ) + self.assertEqual( + data.header_User_Agent, + '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ + 'yplus 4.0.00d)\\"', +# 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\ +# 'yplus 4.0.00d)', + msg = 'Line 3 %{User-Agent}i' + ) + + + def testjunkline(self): + self.assertRaises(ApacheLogParserError,self.p.parse,'foobar') + + def testhasquotesaltn(self): + p = parser(r'%a \"%b\" %c') + line = r'foo "xyz" bar' + data = p.parse(line) + self.assertEqual(data['%a'],'foo', '%a') + self.assertEqual(data['%b'],'xyz', '%c') + self.assertEqual(data['%c'],'bar', '%c') + + def testparsedate(self): + date = '[05/Dec/2006:10:51:44 +0000]' + self.assertEqual(('20061205105144','+0000'),parse_date(date)) + + +if __name__ is '__main__': + unittest.main()