4 class ApacheLogParserError(Exception):
10 Allows dicts to be accessed via dot notation as well as subscripts
11 Makes using the friendly names nicer
13 def __getattr__(self, name):
17 Frequently used log formats stored here
20 # Common Log Format (CLF)
21 'common':r'%h %l %u %t \"%r\" %>s %b',
23 # Common Log Format with Virtual Host
24 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
26 # NCSA extended/combined log format
27 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
31 class Parser (object):
33 # Explanatory comments copied from
34 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
39 # Size of response in bytes, excluding HTTP headers.
40 '%B':'response_bytes',
41 # Size of response in bytes, excluding HTTP headers. In CLF
42 # format, i.e. a "-" rather than a 0 when no bytes are sent.
43 '%b':'response_bytes_clf',
44 # The contents of cookie Foobar in the request sent to the server.
45 # Only version 0 cookies are fully supported.
48 # The time taken to serve the request, in microseconds.
49 '%D':'response_time_us',
50 # The contents of the environment variable FOOBAR
57 # The request protocol
58 '%H':'request_protocol',
59 # The contents of Foobar: header line(s) in the request sent to
60 # the server. Changes made by other modules (e.g. mod_headers)
64 # Number of keepalive requests handled on this connection.
65 # Interesting if KeepAlive is being used, so that, for example,
66 # a "1" means the first keepalive request after the initial one,
67 # "2" the second, etc...; otherwise this is always 0 (indicating
68 # the initial request). Available in versions 2.2.11 and later.
70 # Remote logname (from identd, if supplied). This will return a
71 # dash unless mod_ident is present and IdentityCheck is set On.
72 '%l':'remote_logname',
74 '%m':'request_method',
75 # The contents of note Foobar from another module.
78 # The contents of Foobar: header line(s) in the reply.
80 '%{}o':'reply_header',
81 # The canonical port of the server serving the request
83 # The canonical port of the server serving the request or the
84 # server's actual port or the client's actual port. Valid
85 # formats are canonical, local, or remote.
88 # The process ID of the child that serviced the request.
90 # The process ID or thread id of the child that serviced the
91 # request. Valid formats are pid, tid, and hextid. hextid requires
92 # APR 1.2.0 or higher.
95 # The query string (prepended with a ? if a query string exists,
96 # otherwise an empty string)
98 # First line of request
99 # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
101 # The handler generating the response (if any).
102 '%R':'response_handler',
103 # Status. For requests that got internally redirected, this is
104 # the status of the *original* request --- %>s for the last.
107 # Time the request was received (standard English format)
109 # The time, in the form given by format, which should be in
110 # strftime(3) format. (potentially localized)
111 #'%{format}t':'TODO',
112 # The time taken to serve the request, in seconds.
113 '%T':'response_time_sec',
114 # Remote user (from auth; may be bogus if return status (%s) is 401)
116 # The URL path requested, not including any query string.
118 # The canonical ServerName of the server serving the request.
119 '%v':'canonical_server_name',
120 # The server name according to the UseCanonicalName setting.
121 '%V':'server_name_config', #TODO: Needs better name
122 # Connection status when response is completed:
123 # X = connection aborted before the response completed.
124 # + = connection may be kept alive after the response is sent.
125 # - = connection will be closed after the response is sent.
126 '%X':'completed_connection_status',
127 # Bytes received, including request and headers, cannot be zero.
128 # You need to enable mod_logio to use this.
129 '%I':'bytes_received',
130 # Bytes sent, including headers, cannot be zero. You need to
131 # enable mod_logio to use this.
135 def __init__(self, format, use_friendly_names=False):
137 Takes the log format from an Apache configuration file.
139 It is best to copy and paste directly from the .conf file
140 and pass using a Python raw string e.g.
142 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
143 p = apachelog.parser(format)
148 self._use_friendly_names = use_friendly_names
149 self._parse_format(format)
151 def _parse_format(self, format):
153 Converts the input format to a regular
154 expression, as well as extracting fields
156 Raises an exception if it couldn't compile
159 format = format.strip()
160 format = re.sub('[ \t]+',' ',format)
164 findquotes = re.compile(r'^\\"')
165 findreferreragent = re.compile('Referer|User-Agent', re.I)
166 findpercent = re.compile('^%.*t$')
167 lstripquotes = re.compile(r'^\\"')
168 rstripquotes = re.compile(r'\\"$')
171 for element in format.split(' '):
174 if findquotes.search(element): hasquotes = 1
177 element = lstripquotes.sub('', element)
178 element = rstripquotes.sub('', element)
180 if self._use_friendly_names:
181 self._names.append(self.alias(element))
183 self._names.append(element)
188 if element == '%r' or findreferreragent.search(element):
189 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
191 subpattern = r'\"([^\"]*)\"'
193 elif findpercent.search(element):
194 subpattern = r'(\[[^\]]+\])'
196 elif element == '%U':
199 subpatterns.append(subpattern)
201 self._pattern = '^' + ' '.join(subpatterns) + '$'
203 self._regex = re.compile(self._pattern)
205 raise ApacheLogParserError(e)
207 def parse(self, line):
209 Parses a single line from the log file and returns
210 a dictionary of its contents.
212 Raises an exception if it couldn't parse the line
215 match = self._regex.match(line)
219 for k, v in zip(self._names, match.groups()):
223 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
225 def alias(self, name):
227 Override / replace this method if you want to map format
228 field names to something else. This method is called
229 when the parser is constructed, not when actually parsing
232 For custom format names, such as %{Foobar}C, 'Foobar' is referred to
233 (in this function) as the custom_format and '%{}C' as the name
235 If the custom_format has a '-' in it (and is not a time format), then the
236 '-' is replaced with a '_' so the name remains a valid identifier.
238 Takes and returns a string fieldname
243 if name.startswith('%{'):
244 custom_format = '_' + name[2:-2]
245 name = '%{}' + name[-1]
248 custom_format = custom_format.replace('-', '_')
251 return self.format_to_name[name] + custom_format
257 Returns the compound regular expression the parser extracted
258 from the input format (a string)
264 Returns the field names the parser extracted from the
265 input format (a list)