Skip to content

Commit

Permalink
Merge pull request matomo-org#6851 from piwik/cloudfront_and_iis_bugfix
Browse files Browse the repository at this point in the history
Cloudfront log importing support (web + RTMP) and W3CExtendedFormat refactor (allow derived types to modify recognized fields and interpret time-taken as seconds by default).
  • Loading branch information
diosmosis committed Dec 15, 2014
2 parents dd1dde9 + 1dbec81 commit 2a1c4c7
Show file tree
Hide file tree
Showing 74 changed files with 1,530 additions and 424 deletions.
148 changes: 115 additions & 33 deletions misc/log-analytics/import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class W3cExtendedFormat(RegexFormat):
'sc-bytes': '(?P<length>\S+)',
'cs-host': '(?P<host>\S+)',
'cs-username': '(?P<userid>\S+)',
'time-taken': '(?P<generation_time_milli>\d+)'
'time-taken': '(?P<generation_time_secs>[.\d]+)'
}

def __init__(self):
Expand All @@ -226,24 +226,22 @@ def check_format(self, file):
file.seek(0)
return

# store the header lines for a later check for IIS
self.header_lines = header_lines

# Parse the 4th 'Fields: ' line to create the regex to use
full_regex = []

expected_fields = W3cExtendedFormat.fields.copy() # turn custom field mapping into field => regex mapping
expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping

# if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds
if config.options.w3c_time_taken_in_millisecs:
expected_fields['time-taken'] = '(?P<generation_time_milli>[\d.]+)'

for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems():
expected_fields[mapped_field_name] = W3cExtendedFormat.fields[field_name]
expected_fields[mapped_field_name] = type(self).fields[field_name]
del expected_fields[field_name]

# if the --w3c-time-taken-secs option is used, make sure the time-taken field is interpreted as seconds
if config.options.w3c_time_taken_in_secs:
expected_fields['time-taken'] = '(?P<generation_time_secs>\S+)'
else:
# check if we're importing netscaler logs and if so, issue a warning
if 'netscaler' in header_lines[1].lower():
logging.info("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option. Netscaler"
" stores second values in the time-taken field. If your logfile does this, the aforementioned"
" option must be used in order to get accurate generation times.")

# Skip the 'Fields: ' prefix.
fields_line = fields_line[9:]
for field in fields_line.split():
Expand All @@ -252,12 +250,63 @@ def check_format(self, file):
except KeyError:
regex = '\S+'
full_regex.append(regex)
self.regex = re.compile(' '.join(full_regex))
full_regex = '\s+'.join(full_regex)
self.regex = re.compile(full_regex)

start_pos = file.tell() - len(first_line)
file.seek(start_pos)
file.seek(0)
return self.check_format_line(first_line)

def check_for_iis_option(self):
if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis():
logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS"
" stores millisecond values in the time-taken field. If your logfile does this, the aforementioned"
" option must be used in order to get accurate generation times.")

def _is_iis(self):
return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0

def _is_time_taken_milli(self):
return 'generation_time_milli' not in self.regex.pattern

class IisFormat(W3cExtendedFormat):

fields = W3cExtendedFormat.fields.copy()
fields.update({
'time-taken': '(?P<generation_time_milli>[.\d]+)',
'sc-win32-status': '(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it
# will ensure we always select IIS for the format instead of
# W3C logs when detecting the format. This way there will be
# less accidental importing of IIS logs w/o --w3c-time-taken-milli.
})

def __init__(self):
super(IisFormat, self).__init__()

self.name = 'iis'

class AmazonCloudFrontFormat(W3cExtendedFormat):

fields = W3cExtendedFormat.fields.copy()
fields.update({
'x-event': '(?P<event_action>\S+)',
'x-sname': '(?P<event_name>\S+)',
'cs-uri-stem': '(?:rtmp:/)?(?P<path>/\S*)',
'c-user-agent': '(?P<user_agent>".*?"|\S+)'
})

def __init__(self):
super(AmazonCloudFrontFormat, self).__init__()

self.name = 'amazon_cloudfront'

def get(self, key):
if key == 'event_category' and 'event_category' not in self.matched.groupdict():
return 'cloudfront_rtmp'
elif key == 'status' and 'status' not in self.matched.groupdict():
return '200'
else:
return super(AmazonCloudFrontFormat, self).get(key)

_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? '
_COMMON_LOG_FORMAT = (
'(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] '
Expand All @@ -281,7 +330,8 @@ def check_format(self, file):
'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT),
'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT),
'w3c_extended': W3cExtendedFormat(),
'iis': W3cExtendedFormat(), # for backwards compatibility TODO test
'amazon_cloudfront': AmazonCloudFrontFormat(),
'iis': IisFormat(),
's3': RegexFormat('s3', _S3_LOG_FORMAT),
'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
'nginx_json': JsonFormat('nginx_json'),
Expand Down Expand Up @@ -518,13 +568,15 @@ def _create_parser(self):
'--w3c-map-field', action='callback', callback=self._set_w3c_field_map, type='string',
help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log "
"files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used "
"as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s"
"as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n"
"Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more "
"fields that can be mapped."
% (', '.join(W3cExtendedFormat.fields.keys()))
)
option_parser.add_option(
'--w3c-time-taken-secs', action='store_true', default=False, dest='w3c_time_taken_in_secs',
help="If set, interprets the time-taken W3C log field as a number of seconds. This must be set for importing"
" netscaler logs."
'--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs',
help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing"
" IIS logs."
)
return option_parser

Expand All @@ -536,10 +588,6 @@ def _set_w3c_field_map(self, option, opt_str, value, parser):

custom_name, default_name = parts

if default_name not in W3cExtendedFormat.fields:
fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name)
return

if not hasattr(parser.values, 'custom_w3c_fields'):
parser.values.custom_w3c_fields = {}

Expand All @@ -558,9 +606,6 @@ def _parse_args(self, option_parser):
print(option_parser.format_help())
sys.exit(1)

if not hasattr(self.options, 'custom_w3c_fields'):
self.options.custom_w3c_fields = {}

# Configure logging before calling logging.{debug,info}.
logging.basicConfig(
format='%(asctime)s: [%(levelname)s] %(message)s',
Expand Down Expand Up @@ -598,6 +643,15 @@ def _parse_args(self, option_parser):
else:
self.format = None

if not hasattr(self.options, 'custom_w3c_fields'):
self.options.custom_w3c_fields = {}
elif self.format is not None:
# validate custom field mappings
for custom_name, default_name in self.options.custom_w3c_fields.iteritems():
if default_name not in type(format).fields:
fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name)
return

if not self.options.piwik_url:
fatal_error('no URL given for Piwik')

Expand Down Expand Up @@ -1342,7 +1396,15 @@ def _get_hit_args(self, hit):
)

if hit.generation_time_milli > 0:
args['gt_ms'] = hit.generation_time_milli
args['gt_ms'] = int(hit.generation_time_milli)

if hit.event_category and hit.event_action:
args['e_c'] = hit.event_category
args['e_a'] = hit.event_action

if hit.event_name:
args['e_n'] = hit.event_name

return args

def _record_hits(self, hits):
Expand Down Expand Up @@ -1531,7 +1593,8 @@ def check_format(lineOrFile):
match = candidate_format.check_format_line(lineOrFile)
else:
match = candidate_format.check_format(lineOrFile)
except:
except Exception, e:
logging.debug('Error in format checking: %s', str(e))
pass

if match:
Expand All @@ -1550,6 +1613,11 @@ def check_format(lineOrFile):
else:
logging.debug('Format %s does not match', name)

# if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the
# --w3c-time-taken-milli option isn't set
if isinstance(format, W3cExtendedFormat):
format.check_for_iis_option()

return format

@staticmethod
Expand Down Expand Up @@ -1583,6 +1651,7 @@ def detect_format(file):
pass

if not format:
raise RuntimeError('sdfjlsd')
fatal_error("cannot automatically determine the log format using the first %d lines of the log file. " % limit +
"\nMaybe try specifying the format with the --log-format-name command line argument." )
return
Expand Down Expand Up @@ -1706,13 +1775,13 @@ def invalid_line(line, reason):
hit.length = 0

try:
hit.generation_time_milli = int(format.get('generation_time_milli'))
hit.generation_time_milli = float(format.get('generation_time_milli'))
except BaseFormatException:
try:
hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000
hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000
except BaseFormatException:
try:
hit.generation_time_milli = int(format.get('generation_time_secs')) * 1000
hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000
except BaseFormatException:
hit.generation_time_milli = 0

Expand All @@ -1735,6 +1804,19 @@ def invalid_line(line, reason):
except:
pass

# add event info
try:
hit.event_category = hit.event_action = hit.event_name = None

hit.event_category = format.get('event_category')
hit.event_action = format.get('event_action')

hit.event_name = format.get('event_name')
if hit.event_name == '-':
hit.event_name = None
except:
pass

# Check if the hit must be excluded.
if not all((method(hit) for method in self.check_methods)):
continue
Expand Down
4 changes: 4 additions & 0 deletions misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#Version: 1.0
#Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url​ c-user-agent x-sname x-sname-query x-file-ext x-sid
2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
3 changes: 3 additions & 0 deletions misc/log-analytics/tests/logs/amazon_cloudfront_web.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#Version: 1.0
#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken
2014-05-23 01:13:11 FRA2 182 192.0.2.10 GET d111111abcdef8.cloudfront.net /view/my/file.html 200 www.displaymyfiles.com Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC) - zip=98101 RefreshHit MRVMF7KydIvxMWfJIglgwHQwZsbG2IhRJ07sn9AkKUFSHS9EXAMPLE== d111111abcdef8.cloudfront.net http - 0.001
Loading

0 comments on commit 2a1c4c7

Please sign in to comment.