diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 9e57eeb4cb4..dfd8236d4ae 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -203,7 +203,7 @@ class W3cExtendedFormat(RegexFormat): 'sc-bytes': '(?P\S+)', 'cs-host': '(?P\S+)', 'cs-username': '(?P\S+)', - 'time-taken': '(?P\d+)' + 'time-taken': '(?P[.\d]+)' } def __init__(self): @@ -229,21 +229,22 @@ def check_format(self, file): # Parse the 4th 'Fields: ' line to create the regex to use full_regex = [] - expected_fields = W3cExtendedFormat.fields.copy() # turn custom field mapping into field => regex mapping - for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems(): - expected_fields[mapped_field_name] = W3cExtendedFormat.fields[field_name] - del expected_fields[field_name] + expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping - # if the --w3c-time-taken-secs option is used, make sure the time-taken field is interpreted as seconds - if config.options.w3c_time_taken_in_secs: - expected_fields['time-taken'] = '(?P\S+)' + # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as seconds + if config.options.w3c_time_taken_in_millisecs: + expected_fields['time-taken'] = '(?P[\d.]+)' else: - # check if we're importing netscaler logs and if so, issue a warning - if 'netscaler' in header_lines[1].lower(): - logging.info("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option. Netscaler" - " stores second values in the time-taken field. If your logfile does this, the aforementioned" + # check if we're importing IIS logs and if so, issue a warning TODO: should be done after correct format found so only one warning made + if 'generation_time_milli' not in expected_fields['time-taken'] and self._is_iis(header_lines): + logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS" + " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned" " option must be used in order to get accurate generation times.") + for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems(): + expected_fields[mapped_field_name] = type(self).fields[field_name] + del expected_fields[field_name] + # Skip the 'Fields: ' prefix. fields_line = fields_line[9:] for field in fields_line.split(): @@ -252,12 +253,56 @@ def check_format(self, file): except KeyError: regex = '\S+' full_regex.append(regex) - self.regex = re.compile(' '.join(full_regex)) + full_regex = '\s+'.join(full_regex) + logging.debug(full_regex) + logging.debug(first_line) + self.regex = re.compile(full_regex) - start_pos = file.tell() - len(first_line) - file.seek(start_pos) + file.seek(0) return self.check_format_line(first_line) + def _is_iis(self, header_lines): + return len([line for line in header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0 + +class IisFormat(W3cExtendedFormat): + + fields = W3cExtendedFormat.fields.copy() + fields.update({ + 'time-taken': '(?P[.\d]+)', + 'sc-win32-status': '(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it + # will ensure we always select IIS for the format instead of + # W3C logs when detecting the format. This way there will be + # less accidental importing of IIS logs w/o --w3c-time-taken-milli. + }) + + def __init__(self): + super(IisFormat, self).__init__() + + self.name = 'iis' + +class AmazonCloudFrontFormat(W3cExtendedFormat): + + fields = W3cExtendedFormat.fields.copy() + fields.update({ + 'x-event': '(?P\S+)', + 'x-sname': '(?P\S+)', + 'cs-uri-stem': '(?:rtmp:/)?(?P/\S*)', + 'c-user-agent': '(?P".*?"|\S+)' + }) + + def __init__(self): + super(AmazonCloudFrontFormat, self).__init__() + + self.name = 'amazon_cloudfront' + + def get(self, key): + if key == 'event_category' and 'event_category' not in self.matched.groupdict(): + return 'cloudfront_rtmp' + elif key == 'status' and 'status' not in self.matched.groupdict(): + return '200' + else: + return super(AmazonCloudFrontFormat, self).get(key) + _HOST_PREFIX = '(?P[\w\-\.]*)(?::\d+)? ' _COMMON_LOG_FORMAT = ( '(?P\S+) \S+ \S+ \[(?P.*?) (?P.*?)\] ' @@ -281,7 +326,8 @@ def check_format(self, file): 'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT), 'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT), 'w3c_extended': W3cExtendedFormat(), - 'iis': W3cExtendedFormat(), # for backwards compatibility TODO test + 'amazon_cloudfront': AmazonCloudFrontFormat(), + 'iis': IisFormat(), 's3': RegexFormat('s3', _S3_LOG_FORMAT), 'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT), 'nginx_json': JsonFormat('nginx_json'), @@ -518,13 +564,15 @@ def _create_parser(self): '--w3c-map-field', action='callback', callback=self._set_w3c_field_map, type='string', help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log " "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used " - "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s" + "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n" + "Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more " + "fields that can be mapped." % (', '.join(W3cExtendedFormat.fields.keys())) ) option_parser.add_option( - '--w3c-time-taken-secs', action='store_true', default=False, dest='w3c_time_taken_in_secs', - help="If set, interprets the time-taken W3C log field as a number of seconds. This must be set for importing" - " netscaler logs." + '--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs', + help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing" + " IIS logs." ) return option_parser @@ -536,10 +584,6 @@ def _set_w3c_field_map(self, option, opt_str, value, parser): custom_name, default_name = parts - if default_name not in W3cExtendedFormat.fields: - fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name) - return - if not hasattr(parser.values, 'custom_w3c_fields'): parser.values.custom_w3c_fields = {} @@ -558,9 +602,6 @@ def _parse_args(self, option_parser): print(option_parser.format_help()) sys.exit(1) - if not hasattr(self.options, 'custom_w3c_fields'): - self.options.custom_w3c_fields = {} - # Configure logging before calling logging.{debug,info}. logging.basicConfig( format='%(asctime)s: [%(levelname)s] %(message)s', @@ -598,6 +639,15 @@ def _parse_args(self, option_parser): else: self.format = None + if not hasattr(self.options, 'custom_w3c_fields'): + self.options.custom_w3c_fields = {} + elif self.format is not None: + # validate custom field mappings + for custom_name, default_name in self.options.custom_w3c_fields.iteritems(): + if default_name not in type(format).fields: + fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name) + return + if not self.options.piwik_url: fatal_error('no URL given for Piwik') @@ -1343,6 +1393,7 @@ def _get_hit_args(self, hit): if hit.generation_time_milli > 0: args['gt_ms'] = hit.generation_time_milli + logging.debug(args) return args def _record_hits(self, hits): @@ -1531,7 +1582,8 @@ def check_format(lineOrFile): match = candidate_format.check_format_line(lineOrFile) else: match = candidate_format.check_format(lineOrFile) - except: + except Exception, e: + logging.debug(str(e)) pass if match: @@ -1583,6 +1635,7 @@ def detect_format(file): pass if not format: + raise RuntimeError('sdfjlsd') fatal_error("cannot automatically determine the log format using the first %d lines of the log file. " % limit + "\nMaybe try specifying the format with the --log-format-name command line argument." ) return @@ -1706,13 +1759,13 @@ def invalid_line(line, reason): hit.length = 0 try: - hit.generation_time_milli = int(format.get('generation_time_milli')) + hit.generation_time_milli = float(format.get('generation_time_milli')) except BaseFormatException: try: - hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000 + hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000 except BaseFormatException: try: - hit.generation_time_milli = int(format.get('generation_time_secs')) * 1000 + hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000 except BaseFormatException: hit.generation_time_milli = 0 @@ -1735,6 +1788,14 @@ def invalid_line(line, reason): except: pass + # add event info + try: + hit.event_category = format.get('event_category') + hit.event_action = format.get('event_action') + hit.event_name = format.get('event_name') + except: + pass + # Check if the hit must be excluded. if not all((method(hit) for method in self.check_methods)): continue diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log new file mode 100644 index 00000000000..7b226473d07 --- /dev/null +++ b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log @@ -0,0 +1,4 @@ +#Version: 1.0 +#Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url​ c-user-agent x-sname x-sname-query x-file-ext x-sid +2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - - +2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1 diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_web.log b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log new file mode 100644 index 00000000000..30db4a152af --- /dev/null +++ b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log @@ -0,0 +1,3 @@ +#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken +2014-05-23 01:13:11 FRA2 182 192.0.2.10 GET d111111abcdef8.cloudfront.net /view/my/file.html 200 www.displaymyfiles.com Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC) - zip=98101 RefreshHit MRVMF7KydIvxMWfJIglgwHQwZsbG2IhRJ07sn9AkKUFSHS9EXAMPLE== d111111abcdef8.cloudfront.net http - 0.001 diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index 73811a37176..6e0051c5edb 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -22,30 +22,31 @@ def tearDownModule(): os.remove('tmp.log') def test_format_detection(): - def _test(format_name): - file = open('logs/%s.log' % format_name) + def _test(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + file = open(log_file) import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) - if format_name == 'iis': - assert(format.name == 'w3c_extended') - else: - assert(format.name == format_name) + assert(format.name == format_name) - def _test_junk(format_name): - tmp_path = add_junk_to_file('logs/%s.log' % format_name) + def _test_junk(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + tmp_path = add_junk_to_file(log_file) file = open(tmp_path) import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) - if format_name == 'iis': - assert(format.name == 'w3c_extended') - else: - assert(format.name == format_name) + assert(format.name == format_name) for format_name in import_logs.FORMATS.iterkeys(): - if format_name == 'w3c_extended': # tested by iis and netscaler log files + # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later + if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront': continue f = functools.partial(_test, format_name) @@ -56,6 +57,23 @@ def _test_junk(format_name): f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line' yield f + # add tests for amazon cloudfront (normal web + rtmp) + f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of amazon cloudfront (web) logs.' + yield f + + f = functools.partial(_test_junk, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line' + yield f + + f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.' + yield f + + f = functools.partial(_test_junk, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.' + yield f + class Options(object): """Mock config options necessary to run checkers from Parser class.""" debug = False @@ -77,7 +95,7 @@ class Options(object): enable_http_errors = False download_extensions = 'doc,pdf' custom_w3c_fields = {} - w3c_time_taken_in_secs = False + w3c_time_taken_in_millisecs = False class Config(object): """Mock configuration.""" @@ -242,7 +260,8 @@ def check_iis_groups(groups): assert groups['host'] == 'example.com' expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent', - 'status', 'length', 'host', 'userid', 'generation_time_milli'] + 'status', 'length', 'host', 'userid', 'generation_time_milli', + '__win32_status'] for property_name in groups.keys(): assert property_name in expected_hit_properties @@ -289,7 +308,8 @@ def _test_with_junk(format_name, path): _test(format_name, tmp_path) for format_name in import_logs.FORMATS.iterkeys(): - if format_name == 'w3c_extended': # tested by IIS and netscaler logs + # w3c extended tested by IIS and netscaler logs; amazon cloudfront tested individually + if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront': continue f = functools.partial(_test, format_name, 'logs/' + format_name + '.log') @@ -304,7 +324,6 @@ def _test_with_junk(format_name, path): f.description = 'Testing parsing of format "common" with ncsa_extended log' yield f - def test_iis_custom_format(): """test IIS custom format name parsing.""" @@ -323,6 +342,7 @@ def test_iis_custom_format(): import_logs.config.options.enable_http_redirects = True import_logs.config.options.enable_http_errors = True import_logs.config.options.replay_tracking = False + # import_logs.config.options.w3c_time_taken_in_millisecs = True test that even w/o this, we get the right format import_logs.parser.parse(file_) hits = [hit.__dict__ for hit in Recorder.recorders] @@ -397,7 +417,7 @@ def test_netscaler_parsing(): import_logs.config.options.enable_http_redirects = True import_logs.config.options.enable_http_errors = True import_logs.config.options.replay_tracking = False - import_logs.config.options.w3c_time_taken_in_secs = True + import_logs.config.options.w3c_time_taken_in_millisecs = False import_logs.parser.parse(file_) hits = [hit.__dict__ for hit in Recorder.recorders] @@ -420,4 +440,115 @@ def test_netscaler_parsing(): assert hits[0]['path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' assert hits[0]['is_robot'] == False assert hits[0]['full_path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' - assert hits[0]['user_agent'] == u'Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022)' \ No newline at end of file + assert hits[0]['user_agent'] == u'Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022)' + +def test_amazon_cloudfront_web_parsing(): + """test parsing of amazon cloudfront logs (which use extended W3C log format)""" + + file_ = 'logs/amazon_cloudfront_web.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + import_logs.logging.debug(hits) + + assert hits[0]['status'] == u'200' + assert hits[0]['userid'] == None + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'html' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == u'www.displaymyfiles.com' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 1.0 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log' + assert hits[0]['is_redirect'] == False + assert hits[0]['date'] == datetime.datetime(2014, 5, 23, 1, 13, 11) + assert hits[0]['lineno'] == 2 + assert hits[0]['ip'] == u'192.0.2.10' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/view/my/file.html' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/view/my/file.html' + assert hits[0]['user_agent'] == u'Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC)' + + assert len(hits) == 1 + +def test_amazon_cloudfront_rtmp_parsing(): + """test parsing of amazon cloudfront rtmp logs (which use extended W3C log format w/ custom fields for event info)""" + + file_ = 'logs/amazon_cloudfront_rtmp.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + import_logs.logging.debug(hits) + + assert hits[0]['is_download'] == False + assert hits[0]['ip'] == u'192.0.2.147' + assert hits[0]['is_redirect'] == False + assert hits[0]['filename'] == 'logs/amazon_cloudfront_rtmp.log' + assert hits[0]['event_category'] == 'cloudfront_rtmp' + assert hits[0]['event_action'] == u'connect' + assert hits[0]['lineno'] == 2 + assert hits[0]['status'] == '200' + assert hits[0]['is_error'] == False + assert hits[0]['event_name'] == u'-' + assert hits[0]['args'] == {} + assert hits[0]['host'] == 'foo' + assert hits[0]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 20) + assert hits[0]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + assert hits[0]['extension'] == u'net/cfx/st\u200b' + assert hits[0]['referrer'] == '' + assert hits[0]['userid'] == None + assert hits[0]['user_agent'] == u'LNX%2010,0,32,18' + assert hits[0]['generation_time_milli'] == 0 + assert hits[0]['query_string'] == u'key=value' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + + assert hits[1]['is_download'] == False + assert hits[1]['ip'] == u'192.0.2.222' + assert hits[1]['is_redirect'] == False + assert hits[1]['filename'] == 'logs/amazon_cloudfront_rtmp.log' + assert hits[1]['event_category'] == 'cloudfront_rtmp' + assert hits[1]['event_action'] == u'play' + assert hits[1]['lineno'] == 3 + assert hits[1]['status'] == '200' + assert hits[1]['is_error'] == False + assert hits[1]['event_name'] == u'myvideo' + assert hits[1]['args'] == {} + assert hits[1]['host'] == 'foo' + assert hits[1]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 21) + assert hits[1]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + assert hits[1]['extension'] == u'net/cfx/st\u200b' + assert hits[1]['referrer'] == '' + assert hits[1]['userid'] == None + assert hits[1]['length'] == 3914 + assert hits[1]['user_agent'] == u'LNX%2010,0,32,18' + assert hits[1]['generation_time_milli'] == 0 + assert hits[1]['query_string'] == u'key=value' + assert hits[1]['is_robot'] == False + assert hits[1]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + + assert len(hits) == 2 diff --git a/tests/PHPUnit/Fixtures/ManySitesImportedLogs.php b/tests/PHPUnit/Fixtures/ManySitesImportedLogs.php index 830d22778cd..7642befc165 100644 --- a/tests/PHPUnit/Fixtures/ManySitesImportedLogs.php +++ b/tests/PHPUnit/Fixtures/ManySitesImportedLogs.php @@ -262,9 +262,6 @@ private function logNetscaler() '--w3c-map-field' => array(), '--enable-http-redirects' => false); - $output = self::executeLogImporter($logFile, $opts); - - // make sure warning about --w3c-time-taken-secs appears in importer output - self::assertContains("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option.", implode("\n", $output)); + return self::executeLogImporter($logFile, $opts); } } \ No newline at end of file diff --git a/tests/PHPUnit/Framework/Fixture.php b/tests/PHPUnit/Framework/Fixture.php index a2d6a6327a8..8a1673538d8 100644 --- a/tests/PHPUnit/Framework/Fixture.php +++ b/tests/PHPUnit/Framework/Fixture.php @@ -776,6 +776,8 @@ protected static function executeLogImporter($logFile, $options) // run the command exec($cmd, $output, $result); + echo "$cmd - \n"; + echo implode("\n", $output)."\n"; if ($result !== 0) { throw new Exception("log importer failed: " . implode("\n", $output) . "\n\ncommand used: $cmd"); }