Skip to content

Commit

Permalink
Change IIS log format in log importer script to be more generic and f…
Browse files Browse the repository at this point in the history
…ormat any log in W3C extended log format. Keeps iis format for backwards compatibility.
  • Loading branch information
diosmosis committed Dec 12, 2014
1 parent 5c44ca1 commit a344174
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 48 deletions.
82 changes: 47 additions & 35 deletions misc/log-analytics/import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,9 @@ def get(self, key):
def get_all(self,):
return self.matched.groupdict()

class IisFormat(RegexFormat):
class W3cExtendedFormat(RegexFormat):

FIELDS_LINE_PREFIX = '#Fields: '

fields = {
'date': '(?P<date>^\d+[-\d+]+',
Expand All @@ -205,48 +207,56 @@ class IisFormat(RegexFormat):
}

def __init__(self):
super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S')
super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S')

def check_format(self, file):
header_lines = [file.readline() for i in xrange(3)]
# collect all header lines and the first line of the logfile
header_lines = []
while True:
line = file.readline()

if not header_lines[0].startswith('#'):
if line.startswith('#'):
header_lines.append(line)
else:
break
first_line = line
fields_line = next((line for line in header_lines if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX)), None)

if not header_lines or not fields_line:
file.seek(0)
return

# Parse the '#Fields: ' header line to create the regex to use
full_regex = []
line = file.readline()

expected_fields = IisFormat.fields.copy() # turn custom field mapping into field => regex mapping
for mapped_field_name, field_name in config.options.custom_iis_fields.iteritems():
expected_fields[mapped_field_name] = IisFormat.fields[field_name]
expected_fields = W3cExtendedFormat.fields.copy() # turn custom field mapping into field => regex mapping
for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems():
expected_fields[mapped_field_name] = W3cExtendedFormat.fields[field_name]
del expected_fields[field_name]

# if the --iis-time-taken-secs option is used, make sure the time-taken field is interpreted as seconds
if config.options.iis_time_taken_in_secs:
# if the --w3c-time-taken-secs option is used, make sure the time-taken field is interpreted as seconds
if config.options.w3c_time_taken_in_secs:
expected_fields['time-taken'] = '(?P<generation_time_secs>\S+)'
else:
# check if we're importing netscaler logs and if so, issue a warning
if 'netscaler' in header_lines[1].lower():
logging.info("WARNING: netscaler log file being parsed without --iis-time-taken-secs option. Netscaler"
logging.info("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option. Netscaler"
" stores second values in the time-taken field. If your logfile does this, the aforementioned"
" option must be used in order to get accurate generation times.")

# Skip the 'Fields: ' prefix.
line = line[9:]
for field in line.split():
fields_line = fields_line[9:]
for field in fields_line.split():
try:
regex = expected_fields[field]
except KeyError:
regex = '\S+'
full_regex.append(regex)
self.regex = re.compile(' '.join(full_regex))

start_pos = file.tell()
nextline = file.readline()
start_pos = file.tell() - len(first_line)
file.seek(start_pos)
return self.check_format_line(nextline)
return self.check_format_line(first_line)

_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? '
_COMMON_LOG_FORMAT = (
Expand All @@ -270,7 +280,8 @@ def check_format(self, file):
'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT),
'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT),
'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT),
'iis': IisFormat(),
'w3c_extended': W3cExtendedFormat(),
'iis': W3cExtendedFormat(), # for backwards compatibility TODO test
's3': RegexFormat('s3', _S3_LOG_FORMAT),
'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
'nginx_json': JsonFormat('nginx_json'),
Expand Down Expand Up @@ -504,34 +515,35 @@ def _create_parser(self):
help="By default Piwik tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped."
)
option_parser.add_option(
'--iis-map-field', action='callback', callback=self._set_iis_field_map, type='string',
help="Map a custom log entry field in your IIS log to a default one. Use this option to load custom IIS log "
"files such as those from the Advanced Logging IIS module. Used as, eg, --iis-map-field my-date=date. "
"Recognized default fields include: %s" % (', '.join(IisFormat.fields.keys()))
'--w3c-map-field', action='callback', callback=self._set_w3c_field_map, type='string',
help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log "
"files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used "
"as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s"
% (', '.join(W3cExtendedFormat.fields.keys()))
)
option_parser.add_option(
'--iis-time-taken-secs', action='store_true', default=False, dest='iis_time_taken_in_secs',
help="If set, interprets the time-taken IIS log field as a number of seconds. This must be set for importing"
'--w3c-time-taken-secs', action='store_true', default=False, dest='w3c_time_taken_in_secs',
help="If set, interprets the time-taken W3C log field as a number of seconds. This must be set for importing"
" netscaler logs."
)
return option_parser

def _set_iis_field_map(self, option, opt_str, value, parser):
def _set_w3c_field_map(self, option, opt_str, value, parser):
parts = value.split('=')

if len(parts) != 2:
fatal_error("Invalid --iis-map-field option: '%s'" % value)
fatal_error("Invalid --w3c-map-field option: '%s'" % value)

custom_name, default_name = parts

if default_name not in IisFormat.fields:
fatal_error("custom IIS field mapping error: don't know how to parse and use the '%' field" % default_name)
if default_name not in W3cExtendedFormat.fields:
fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name)
return

if not hasattr(parser.values, 'custom_iis_fields'):
parser.values.custom_iis_fields = {}
if not hasattr(parser.values, 'custom_w3c_fields'):
parser.values.custom_w3c_fields = {}

parser.values.custom_iis_fields[custom_name] = default_name
parser.values.custom_w3c_fields[custom_name] = default_name

def _parse_args(self, option_parser):
"""
Expand All @@ -546,8 +558,8 @@ def _parse_args(self, option_parser):
print(option_parser.format_help())
sys.exit(1)

if not hasattr(self.options, 'custom_iis_fields'):
self.options.custom_iis_fields = {}
if not hasattr(self.options, 'custom_w3c_fields'):
self.options.custom_w3c_fields = {}

# Configure logging before calling logging.{debug,info}.
logging.basicConfig(
Expand Down Expand Up @@ -1549,7 +1561,7 @@ def detect_format(file):

format = False

# check the format using the file (for formats like the IIS one)
# check the format using the file (for formats like the W3cExtendedFormat one)
format = Parser.check_format(file)

# check the format using the first N lines (to avoid irregular ones)
Expand Down Expand Up @@ -1663,7 +1675,7 @@ def invalid_line(line, reason):
except BaseFormatException:
hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)

# IIS defaults to - when there is no query string, but we want empty string
# W3cExtendedFormat defaults to - when there is no query string, but we want empty string
if hit.query_string == '-':
hit.query_string = ''

Expand All @@ -1690,7 +1702,7 @@ def invalid_line(line, reason):
try:
hit.length = int(format.get('length'))
except (ValueError, BaseFormatException):
# Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
# Some lines or formats don't have a length (e.g. 304 redirects, W3C logs)
hit.length = 0

try:
Expand Down
30 changes: 21 additions & 9 deletions misc/log-analytics/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ def _test(format_name):
import_logs.config = Config()
format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
if format_name == 'iis':
assert(format.name == 'w3c_extended')
else:
assert(format.name == format_name)

def _test_junk(format_name):
tmp_path = add_junk_to_file('logs/%s.log' % format_name)
Expand All @@ -36,9 +39,15 @@ def _test_junk(format_name):
import_logs.config = Config()
format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
if format_name == 'iis':
assert(format.name == 'w3c_extended')
else:
assert(format.name == format_name)

for format_name in import_logs.FORMATS.iterkeys():
if format_name == 'w3c_extended': # tested by iis and netscaler log files
continue

f = functools.partial(_test, format_name)
f.description = 'Testing autodetection of format ' + format_name
yield f
Expand Down Expand Up @@ -67,8 +76,8 @@ class Options(object):
included_paths = []
enable_http_errors = False
download_extensions = 'doc,pdf'
custom_iis_fields = {}
iis_time_taken_in_secs = False
custom_w3c_fields = {}
w3c_time_taken_in_secs = False

class Config(object):
"""Mock configuration."""
Expand Down Expand Up @@ -188,7 +197,7 @@ def test_replay_tracking_arguments():
def parse_log_file_line(format_name, file_):
format = import_logs.FORMATS[format_name]

import_logs.config.options.custom_iis_fields = {}
import_logs.config.options.custom_w3c_fields = {}

file = open(file_)
match = format.check_format(file)
Expand Down Expand Up @@ -280,6 +289,9 @@ def _test_with_junk(format_name, path):
_test(format_name, tmp_path)

for format_name in import_logs.FORMATS.iterkeys():
if format_name == 'w3c_extended': # tested by IIS and netscaler logs
continue

f = functools.partial(_test, format_name, 'logs/' + format_name + '.log')
f.description = 'Testing parsing of format "%s"' % format_name
yield f
Expand All @@ -299,7 +311,7 @@ def test_iis_custom_format():
file_ = 'logs/iis_custom.log'

# have to override previous globals override for this test
import_logs.config.options.custom_iis_fields = {
import_logs.config.options.custom_w3c_fields = {
'date-local': 'date',
'time-local': 'time',
'cs(Host)': 'cs-host',
Expand Down Expand Up @@ -373,19 +385,19 @@ def test_iis_custom_format():
assert hits[2]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36'

def test_netscaler_parsing():
"""test parsing of netscaler logs (which are similar to IIS logs)"""
"""test parsing of netscaler logs (which use extended W3C log format)"""

file_ = 'logs/netscaler.log'

# have to override previous globals override for this test
import_logs.config.options.custom_iis_fields = {}
import_logs.config.options.custom_w3c_fields = {}
Recorder.recorders = []
import_logs.parser = import_logs.Parser()
import_logs.config.format = None
import_logs.config.options.enable_http_redirects = True
import_logs.config.options.enable_http_errors = True
import_logs.config.options.replay_tracking = False
import_logs.config.options.iis_time_taken_in_secs = True
import_logs.config.options.w3c_time_taken_in_secs = True
import_logs.parser.parse(file_)

hits = [hit.__dict__ for hit in Recorder.recorders]
Expand Down
8 changes: 4 additions & 4 deletions tests/PHPUnit/Fixtures/ManySitesImportedLogs.php
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ private function logIisWithCustomFormat()

$opts = array('--idsite' => $this->idSite,
'--token-auth' => self::getTokenAuth(),
'--iis-map-field' => array('date-local=date', 'time-local=time', 'cs(Host)=cs-host', 'TimeTakenMS=time-taken'),
'--w3c-map-field' => array('date-local=date', 'time-local=time', 'cs(Host)=cs-host', 'TimeTakenMS=time-taken'),
'--enable-http-errors' => false,
'--enable-http-redirects' => false);

Expand All @@ -259,12 +259,12 @@ private function logNetscaler()

$opts = array('--idsite' => $this->idSite,
'--token-auth' => self::getTokenAuth(),
'--iis-map-field' => array(),
'--w3c-map-field' => array(),
'--enable-http-redirects' => false);

$output = self::executeLogImporter($logFile, $opts);

// make sure warning about --iis-time-taken-secs appears in importer output
self::assertContains("WARNING: netscaler log file being parsed without --iis-time-taken-secs option.", implode("\n", $output));
// make sure warning about --w3c-time-taken-secs appears in importer output
self::assertContains("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option.", implode("\n", $output));
}
}

0 comments on commit a344174

Please sign in to comment.