From 269bb1a61ca25028a755e1d643baa595e0769bb8 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Tue, 5 Sep 2023 14:28:00 +0100
Subject: [PATCH] test: script for executing A/B-Tests

This script is intended to be executed by our CI to perform an A/B-test
across two commits (for instance, a buildkite pipeline would get
triggered on PR merge, and the pipeline would call this script with the
commit range of the just-merged PR). It compiles the two git revisions
of firecracker using each revision's devtool, and then passes the
resulting binaries to the relevant A/B-test.

After collecting data for both the A and the B revision, it analyzes
the produced EMF logs for raw time series (i.e. EMF properties that are
assigned lists of values). For each such data series found, it then
performs a statistical test to assert that there is no regression in
this data series (for this, it asserts that both the A and the B
revision produce the same EMF messages (based on dimensions), and that
for each unique dimension, the same data series are emitted).

We choose a permutation test as it is non-parametric (which we need
since we cannot make normality assumptions about arbitrary performance
data). Non-parametric here means it compares two arbitrary sets of
samples, and then gives us a p-value for the null hypothesis H_0 "both
sets of samples were drawn from the same (unknown) distribution". The
p-value is easy to interpret, as it tells us the probability of
observing a result at least as extreme as the one actually measured,
given that performance did not change.

Signed-off-by: Patrick Roy
---
 tests/framework/ab_test.py |  37 +++++-
 tools/ab_test.py           | 260 +++++++++++++++++++++++++++++++++++++
 tools/devtool              |  14 +-
 3 files changed, 305 insertions(+), 6 deletions(-)
 create mode 100755 tools/ab_test.py

diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
index 6520d9c73ee8..316ecc5704b9 100644
--- a/tests/framework/ab_test.py
+++ b/tests/framework/ab_test.py
@@ -23,9 +23,12 @@
 """
 import contextlib
 import os
+import statistics
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Callable, Optional, TypeVar
+from typing import Callable, List, Optional, TypeVar
+
+import scipy
 
 from framework import utils
 
@@ -97,6 +100,29 @@ def git_ab_test(
     return result_a, result_b, comparison
 
 
+def check_regression(a_samples: List[float], b_samples: List[float]):
+    """Checks for a regression by performing a permutation test.
+
+    A permutation test is a non-parametric test that takes three parameters:
+    two populations (sets of samples) and a function computing a "statistic"
+    based on the two populations. First, the test computes the statistic for
+    the initial populations. It then randomly permutes the two populations
+    (i.e. merges them and then randomly splits them again). For each such
+    permuted population, the statistic is computed. Then, all the statistics
+    are sorted, and the percentile of the statistic for the initial
+    populations is computed. We then look at the fraction of statistics that
+    are larger/smaller than that of the initial populations. The minimum of
+    these two fractions then becomes the p-value.
+
+    The idea is that if the two populations are indeed drawn from the same
+    distribution (i.e. if performance did not change), then permuting will
+    not affect the statistic (indeed, the statistics should be approximately
+    normally distributed, and the statistic for the initial populations will
+    be somewhere "in the middle").
+
+    Useful for performance tests.
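+
+    As a (made-up) numerical illustration, with the difference of means as
+    the statistic: for a_samples = [10, 11] and b_samples = [15, 16], the
+    observed statistic is 15.5 - 10.5 = 5. The test then recomputes the
+    statistic over random re-splits of the merged samples [10, 11, 15, 16]
+    and reports how often a value at least as extreme as 5 arises by chance.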
+    """
+    return scipy.stats.permutation_test(
+        (a_samples, b_samples),
+        # Compute the difference of means, such that a positive difference
+        # indicates a potential regression.
+        lambda x, y: statistics.mean(y) - statistics.mean(x),
+        vectorized=False,
+    )
+
+
 @contextlib.contextmanager
 def temporary_checkout(revision: str):
     """
@@ -106,9 +132,12 @@ def temporary_checkout(revision: str):
     happen along the way.
     """
     with TemporaryDirectory() as tmp_dir:
-        utils.run_cmd(
-            f"git clone https://github.com/firecracker-microvm/firecracker {tmp_dir}"
-        )
+        # `git clone` can take a path instead of a URL, which causes it to create a copy of
+        # the repository at the given path. However, that path needs to point to the root of
+        # a repository; it cannot be some arbitrary subdirectory. Therefore:
+        _, git_root, _ = utils.run_cmd("git rev-parse --show-toplevel")
+        # split off the '\n' at the end of the stdout
+        utils.run_cmd(f"git clone {git_root.strip()} {tmp_dir}")
         with chdir(tmp_dir):
             utils.run_cmd(f"git checkout {revision}")
 
diff --git a/tools/ab_test.py b/tools/ab_test.py
new file mode 100755
index 000000000000..9049f1c5fbe8
--- /dev/null
+++ b/tools/ab_test.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Script for running A/B-Tests
+
+The script takes two git revisions and a pytest integration test. It utilizes
+our integration test framework's --binary-dir parameter to execute the given
+test using binaries compiled from each revision, and captures the EMF log
+output. It then searches for list-valued properties in the EMF, and runs a
+regression test comparing these lists for the two runs.
+
+It performs the A/B-test as follows:
+For each EMF log message output, look at the dimensions. The script assumes
+that dimensions are unique across all log messages output from a single test
+run. Within each log message, it then looks for all properties that have
+lists assigned to them, and collects them. For both runs of the test, the
+set of distinct dimensions collected this way must be the same. Then, we
+match corresponding dimensions between the two runs and perform a statistical
+regression test across all the list-valued properties collected.
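+
+Example invocation (revisions and test path are illustrative):
+
+    tools/ab_test.py main my-feature-branch \
+        --test integration_tests/performance/test_boottime.py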
+"""
+import argparse
+import asyncio
+import json
+import platform
+import shutil
+import statistics
+import sys
+from pathlib import Path
+
+from aws_embedded_metrics.logger.metrics_logger_factory import create_metrics_logger
+
+# Hack to be able to use our test framework code
+sys.path.append(str(Path(__file__).parent.parent / "tests"))
+
+# pylint:disable=wrong-import-position
+from framework import utils
+from framework.ab_test import chdir, check_regression, git_ab_test
+
+
+def extract_dimensions(emf):
+    """Extracts the CloudWatch dimensions from an EMF log message"""
+    dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0]
+    dimensions = {}
+
+    for key in emf:
+        if key in dimension_list:
+            dimensions[key] = emf[key]
+
+    return dimensions
+
+
+def extract_metrics(emf):
+    """Extracts the CloudWatch metrics from an EMF log message"""
+    metric_description = emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
+    metrics = {}
+
+    for key in emf:
+        try:
+            metric = next(
+                metric for metric in metric_description if metric["Name"] == key
+            )
+            metrics[key] = (emf[key], metric["Unit"])
+        except StopIteration:
+            pass
+
+    return metrics
+
+
+def reemit_emf_log_entry(log_entry: str, revision: str):
+    """Parses the given EMF log entry and reemits it, overwriting the attached "git_commit_id" field with the given revision"""
+    metrics_logger = create_metrics_logger()
+    emf = json.loads(log_entry)
+    emf["git_commit_id"] = revision
+
+    dimensions = extract_dimensions(emf)
+    metrics = extract_metrics(emf)
+
+    metrics_logger.set_dimensions(dimensions)
+
+    for name, (value, unit) in metrics.items():
+        metrics_logger.put_metric(name, value, unit)
+
+    result = {}
+
+    for key in emf:
+        if key not in dimensions and key not in metrics:
+            metrics_logger.set_property(key, emf[key])
+
+            if isinstance(emf[key], list):
+                result[key] = emf[key]
+
+    asyncio.run(metrics_logger.flush())
+
+    return frozenset(dimensions.items()), result
+
+
+def load_data_series(revision: str):
+    """Loads the data series relevant for A/B-testing from test_results/test-report.json
+    into a dictionary mapping each message's CloudWatch dimensions to a dictionary of
+    its list-valued properties.
+
+    Also reemits all EMF logs using aws-embedded-metrics."""
+    data = {}
+
+    report = json.loads(Path("test_results/test-report.json").read_text("UTF-8"))
+    for test in report["tests"]:
+        for line in test["teardown"]["stdout"].splitlines():
+            # Only look at EMF log messages. If we ever have other stdout that
+            # starts with braces, we will need to rethink this heuristic.
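+            # For reference, a (hypothetical, heavily abbreviated) EMF entry
+            # has the shape {"_aws": {"CloudWatchMetrics": [...]}, ...}, so a
+            # leading "{" identifies candidate lines.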
+            if line.startswith("{"):
+                dimensions, result = reemit_emf_log_entry(line, revision)
+                data[dimensions] = result
+
+    return data
+
+
+def collect_data(firecracker_checkout: Path, test: str):
+    """Executes the specified test using a firecracker binary compiled from the given checkout"""
+    with chdir(firecracker_checkout):
+        revision = utils.run_cmd("git rev-parse HEAD").stdout.strip()
+
+    binary_dir = Path.cwd() / "build" / revision
+
+    if not (binary_dir / "firecracker").exists():
+        with chdir(firecracker_checkout):
+            print(f"Compiling firecracker from revision {binary_dir.name}")
+            utils.run_cmd("./tools/release.sh --libc musl --profile release")
+        build_dir = (
+            firecracker_checkout
+            / f"build/cargo_target/{platform.machine()}-unknown-linux-musl/release"
+        )
+        binary_dir.mkdir(parents=True, exist_ok=True)
+        shutil.copy(build_dir / "firecracker", binary_dir)
+        shutil.copy(build_dir / "jailer", binary_dir)
+    else:
+        print(f"Using existing binaries for revision {binary_dir.name}")
+
+    print("Collecting samples")
+    _, stdout, _ = utils.run_cmd(
+        f"AWS_EMF_ENVIRONMENT=local AWS_EMF_NAMESPACE=local ./tools/test.sh --binary-dir=/firecracker/build/{revision} {test} -m nonci"
+    )
+    print(stdout.strip())
+
+    return load_data_series(revision)
+
+
+def analyze_data(data_a, data_b):
+    """
+    Analyzes the A/B-test data produced by `collect_data` by performing regression tests
+    as described in this script's doc-comment.
+
+    Returns a mapping of dimensions and properties to the result of their regression test.
+    """
+    assert set(data_a.keys()) == set(
+        data_b.keys()
+    ), "A and B run produced incomparable data. This is a bug in the test!"
+
+    results = {}
+
+    metrics_logger = create_metrics_logger()
+
+    for config in data_a:
+        a_result = data_a[config]
+        b_result = data_b[config]
+
+        assert set(a_result.keys()) == set(
+            b_result.keys()
+        ), "A and B run produced incomparable data. This is a bug in the test!"
+
+        for property_name in a_result:
+            print(
+                f"Doing A/B-test for dimensions {config} and property {property_name}"
+            )
+            result = check_regression(a_result[property_name], b_result[property_name])
+
+            metrics_logger.set_dimensions(dict(config))
+            metrics_logger.put_metric("p_value", result.pvalue, "None")
+            metrics_logger.put_metric("mean_difference", result.statistic, "None")
+            metrics_logger.set_property("data_a", a_result[property_name])
+            metrics_logger.set_property("data_b", b_result[property_name])
+            asyncio.run(metrics_logger.flush())
+
+            results[config, property_name] = result
+
+    return results
+
+
+def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh):
+    """Does an A/B-test of the specified test across the given revisions"""
+    _, commit_list, _ = utils.run_cmd(
+        f"git --no-pager log --oneline {a_revision}..{b_revision}"
+    )
+    print(
+        f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
+    )
+    print(commit_list.strip())
+
+    a_result, _, results = git_ab_test(
+        lambda checkout, _: collect_data(checkout, test),
+        analyze_data,
+        a_revision=a_revision,
+        b_revision=b_revision,
+    )
+
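+    # Illustration of the check below, using the CLI defaults (p_thresh=0.01,
+    # strength_thresh=0.2): a metric whose baseline mean is 100ms is only
+    # flagged as regressed if the permutation test yields p < 0.01 AND the
+    # observed mean difference exceeds 0.2 * 100ms = 20ms.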
+    failures = []
+    for (config, property_name), result in results.items():
+        baseline = a_result[config][property_name]
+        if (
+            result.pvalue < p_thresh
+            and abs(result.statistic) > abs(statistics.mean(baseline)) * strength_thresh
+        ):
+            failures.append((config, property_name, result))
+
+    failure_report = "\n".join(
+        f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a regression of \033[0;31m\033[1m{result.statistic:.2f}ms\033[0m for metric \033[1m{property_name}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. This means that observing a change of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. Tested Dimensions:\n{json.dumps(dict(config), indent=2)}"
+        for (config, property_name, result) in failures
+    )
+    assert not failures, "\n" + failure_report
+    print("No regressions detected!")
+
+
+def canonicalize_revision(revision):
+    """Canonicalizes the given revision to a 40-digit hex SHA"""
+    return utils.run_cmd(f"git rev-parse {revision}").stdout.strip()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Executes Firecracker's A/B test suite across the specified commits"
+    )
+    parser.add_argument(
+        "a_revision",
+        help="The baseline revision compared to which we want to avoid regressing",
+    )
+    parser.add_argument(
+        "b_revision",
+        help="The revision whose performance we want to compare against the results from a_revision",
+    )
+    parser.add_argument("--test", help="The test to run", required=True)
+    parser.add_argument(
+        "--significance",
+        help="The p-value threshold that needs to be crossed for a test result to be considered significant",
+        type=float,
+        default=0.01,
+    )
+    parser.add_argument(
+        "--relative-strength",
+        help="The minimal delta (as a fraction of the baseline mean) required before a regression will be considered valid",
+        type=float,
+        default=0.2,
+    )
+    args = parser.parse_args()
+
+    ab_performance_test(
+        # These will show up in CloudWatch, so canonicalize to long commit SHAs
+        canonicalize_revision(args.a_revision),
+        canonicalize_revision(args.b_revision),
+        args.test,
+        args.significance,
+        args.relative_strength,
+    )
diff --git a/tools/devtool b/tools/devtool
index 66dabe5ebee3..cbabc5ccf001 100755
--- a/tools/devtool
+++ b/tools/devtool
@@ -532,7 +532,7 @@ ensure_ci_artifacts() {
 # Please see `$0 help` for more information.
 #
 cmd_test() {
-
+    do_ab_test=0
     # Parse any command line args.
     while [ $# -gt 0 ]; do
        case "$1" in
@@ -545,6 +545,9 @@ cmd_test() {
                shift
                local cpuset_mems="$1"
                ;;
+            "--ab")
+                do_ab_test=1
+                ;;
            "--") { shift; break; } ;;
            *)
                die "Unknown argument: $1. Please use --help for help."
@@ -583,6 +586,12 @@ cmd_test() {
        say "Detected CI, tuning CPU frequency scaling for reduced variability"
    fi
 
+    test_script="./tools/test.sh"
+
+    if [ $do_ab_test -eq 1 ]; then
+        test_script="./tools/ab_test.py"
+    fi
+
    run_devctr \
        --privileged \
        --security-opt seccomp=unconfined \
@@ -594,7 +603,7 @@ cmd_test() {
        --cpuset-mems="$cpuset_mems" \
        --env-file env.list \
        -- \
-        ./tools/test.sh "$@"
+        $test_script "$@"
 
    ret=$?
 
@@ -610,6 +619,7 @@
    return $ret
 }
 
+
 # `$0 shell` - drop to a shell prompt inside the dev container
 # Please see `$0 help` for more information.
 #