4 class ApacheLogParserError(Exception):
10 Allows dicts to be accessed via dot notation as well as subscripts
11 Makes using the friendly names nicer
13 def __getattr__(self, name):
17 Frequently used log formats stored here
20 # Common Log Format (CLF)
21 'common':r'%h %l %u %t \"%r\" %>s %b',
23 # Common Log Format with Virtual Host
24 'vhcommon':r'%v %h %l %u %t \"%r\" %>s %b',
26 # NCSA extended/combined log format
27 # (common + "%{Referer}i" + "%{User-Agent}i")
28 'extended':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
30 # Nginx default log format (extended + "$gzip_ratio")
31 'nginx':r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{gzip-ratio}i\"',
35 class Parser (object):
37 # Explanatory comments copied from
38 # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
43 # Size of response in bytes, excluding HTTP headers.
44 '%B':'response_bytes',
45 # Size of response in bytes, excluding HTTP headers. In CLF
46 # format, i.e. a "-" rather than a 0 when no bytes are sent.
47 '%b':'response_bytes_clf',
48 # The contents of cookie Foobar in the request sent to the server.
49 # Only version 0 cookies are fully supported.
52 # The time taken to serve the request, in microseconds.
53 '%D':'response_time_us',
54 # The contents of the environment variable FOOBAR
61 # The request protocol
62 '%H':'request_protocol',
63 # The contents of Foobar: header line(s) in the request sent to
64 # the server. Changes made by other modules (e.g. mod_headers)
68 # Number of keepalive requests handled on this connection.
69 # Interesting if KeepAlive is being used, so that, for example,
70 # a "1" means the first keepalive request after the initial one,
71 # "2" the second, etc...; otherwise this is always 0 (indicating
72 # the initial request). Available in versions 2.2.11 and later.
74 # Remote logname (from identd, if supplied). This will return a
75 # dash unless mod_ident is present and IdentityCheck is set On.
76 '%l':'remote_logname',
78 '%m':'request_method',
79 # The contents of note Foobar from another module.
82 # The contents of Foobar: header line(s) in the reply.
84 '%{}o':'reply_header',
85 # The canonical port of the server serving the request
87 # The canonical port of the server serving the request or the
88 # server's actual port or the client's actual port. Valid
89 # formats are canonical, local, or remote.
92 # The process ID of the child that serviced the request.
94 # The process ID or thread id of the child that serviced the
95 # request. Valid formats are pid, tid, and hextid. hextid requires
96 # APR 1.2.0 or higher.
99 # The query string (prepended with a ? if a query string exists,
100 # otherwise an empty string)
102 # First line of request
103 # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
105 # The handler generating the response (if any).
106 '%R':'response_handler',
107 # Status. For requests that got internally redirected, this is
108 # the status of the *original* request --- %>s for the last.
111 # Time the request was received (standard English format)
113 # The time, in the form given by format, which should be in
114 # strftime(3) format. (potentially localized)
115 #'%{format}t':'TODO',
116 # The time taken to serve the request, in seconds.
117 '%T':'response_time_sec',
118 # Remote user (from auth; may be bogus if return status (%s) is 401)
120 # The URL path requested, not including any query string.
122 # The canonical ServerName of the server serving the request.
123 '%v':'canonical_server_name',
124 # The server name according to the UseCanonicalName setting.
125 '%V':'server_name_config', #TODO: Needs better name
126 # Connection status when response is completed:
127 # X = connection aborted before the response completed.
128 # + = connection may be kept alive after the response is sent.
129 # - = connection will be closed after the response is sent.
130 '%X':'completed_connection_status',
131 # Bytes received, including request and headers, cannot be zero.
132 # You need to enable mod_logio to use this.
133 '%I':'bytes_received',
134 # Bytes sent, including headers, cannot be zero. You need to
135 # enable mod_logio to use this
139 def __init__(self, format, use_friendly_names=False):
141 Takes the log format from an Apache configuration file.
143 Best just copy and paste directly from the .conf file
144 and pass using a Python raw string e.g.
146 format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
147 p = apachelog.parser(format)
152 self._use_friendly_names = use_friendly_names
153 self._parse_format(format)
155 def _parse_format(self, format):
157 Converts the input format to a regular
158 expression, as well as extracting fields
160 Raises an exception if it couldn't compile
163 format = format.strip()
164 format = re.sub('[ \t]+',' ',format)
168 findquotes = re.compile(r'^\\"')
169 findreferreragent = re.compile('Referer|User-Agent', re.I)
170 findpercent = re.compile('^%.*t$')
171 lstripquotes = re.compile(r'^\\"')
172 rstripquotes = re.compile(r'\\"$')
175 for element in format.split(' '):
178 if findquotes.search(element): hasquotes = 1
181 element = lstripquotes.sub('', element)
182 element = rstripquotes.sub('', element)
184 if self._use_friendly_names:
185 self._names.append(self.alias(element))
187 self._names.append(element)
192 if element == '%r' or findreferreragent.search(element):
193 subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
195 subpattern = r'\"([^\"]*)\"'
197 elif findpercent.search(element):
198 subpattern = r'(\[[^\]]+\])'
200 elif element == '%U':
203 subpatterns.append(subpattern)
205 self._pattern = '^' + ' '.join(subpatterns) + '$'
207 self._regex = re.compile(self._pattern)
209 raise ApacheLogParserError(e)
211 def parse(self, line):
213 Parses a single line from the log file and returns
214 a dictionary of its contents.
216 Raises an exception if it couldn't parse the line
219 match = self._regex.match(line)
223 for k, v in zip(self._names, match.groups()):
227 raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )
229 def alias(self, name):
231 Override / replace this method if you want to map format
232 field names to something else. This method is called
233 when the parser is constructed, not when actually parsing
236 For custom format names, such as %{Foobar}C, 'Foobar' is referred to
237 (in this function) as the custom_format and '%{}C' as the name
239 If the custom_format has a '-' in it (and is not a time format), then the
240 '-' is replaced with a '_' so the name remains a valid identifier.
242 Takes and returns a string fieldname
247 if name.startswith('%{'):
248 custom_format = '_' + name[2:-2]
249 name = '%{}' + name[-1]
252 custom_format = custom_format.replace('-', '_')
255 return self.format_to_name[name] + custom_format
261 Returns the compound regular expression the parser extracted
262 from the input format (a string)
268 Returns the field names the parser extracted from the
269 input format (a list)