test: script for executing A/B-Tests
This script is intended to be executed by our CI to perform an A/B-test
across two commits (for instance, a Buildkite pipeline would get
triggered on PR merge, and the pipeline would call this script with the
commit range of the just-merged PR).

It compiles two git revisions of firecracker using the respective
revision's devtool, and then passes these binaries to the relevant A/B-test.

After collecting data for both the A and the B revision, it analyzes the
produced EMF logs for raw time series (i.e. EMF properties/metrics that
are assigned lists of values). For each such data series found, it then
performs a statistical test to assert that there is no regression in
this data series (for this, it asserts that both the A and the B revision
produce the same EMF messages (based on dimensions), and that for each
unique dimension set, the same data series are emitted).
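
For illustration, such an EMF message with one dimension set and one raw
time series could look like the sketch below; the dimension and metric
names are made up for the sketch, not taken from an actual test run.

# Hypothetical EMF message: dimensions "instance"/"performance_test", raw series "latency"
emf_message = {
    "_aws": {
        "Timestamp": 1695340800000,
        "CloudWatchMetrics": [
            {
                "Namespace": "local",
                "Dimensions": [["instance", "performance_test"]],
                "Metrics": [{"Name": "latency", "Unit": "Milliseconds"}],
            }
        ],
    },
    "instance": "m6i.metal",
    "performance_test": "test_network_latency",
    "latency": [0.21, 0.22, 0.20],  # list-valued, so the A/B analysis picks it up
}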

We choose a Permutation Test as it is non-parametric (which we need
since we cannot make normality assumptions about arbitrary performance
data). Non-parametric here means it compares two arbitrary sets of
samples, and then gives us a p-value about the H_0 hypothesis "both
sets of samples were drawn from the same (unknown) distribution".
The p-value is easy to interpret, as it tells us the probability of
observing a result as bad as the actually measured one, given that
performance did not change.
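
As a minimal sketch of this interpretation (using scipy and made-up
latency samples, not actual measurements):

import statistics

import scipy.stats

# Hypothetical samples from the A (baseline) and B (candidate) revisions, in ms
a_samples = [10.1, 10.3, 9.9, 10.2, 10.0]
b_samples = [10.4, 10.6, 10.3, 10.5, 10.4]

result = scipy.stats.permutation_test(
    (a_samples, b_samples),
    # difference of means: positive values mean B is slower than A
    lambda x, y: statistics.mean(y) - statistics.mean(x),
    vectorized=False,
)
print(result.statistic, result.pvalue)  # a small p-value is evidence against H_0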

Signed-off-by: Patrick Roy <[email protected]>
roypat committed Sep 22, 2023
1 parent 67b2265 commit d1dda22
Showing 4 changed files with 286 additions and 6 deletions.
37 changes: 33 additions & 4 deletions tests/framework/ab_test.py
@@ -23,9 +23,12 @@
"""
import contextlib
import os
import statistics
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Optional, TypeVar
from typing import Callable, List, Optional, TypeVar

import scipy

from framework import utils

@@ -97,6 +100,29 @@ def git_ab_test(
return result_a, result_b, comparison


def check_regression(a_samples: List[float], b_samples: List[float]):
"""Checks for a regression by performing a permutation test. A permutation test is a non-parametric test that takes
three parameters: Two populations (sets of samples) and a function computing a "statistic" based on two populations.
First, the test computes the statistic for the initial populations. It then randomly
permutes the two populations (i.e. merges them and then randomly splits them again). For each such permuted
population, the statistic is computed. Then, all the statistics are sorted, and the percentile of the statistic for the
initial populations is computed. We then look at the fraction of statistics that are larger/smaller than that of the
initial populations. The minimum of these two fractions will then become the p-value.
The idea is that if the two populations are indeed drawn from the same distribution (i.e. if performance did not
change), then permuting will not affect the statistic (indeed, it should be approximately normally distributed, and
the statistic for the initial populations will be somewhere "in the middle").
Useful for performance tests.
"""
return scipy.stats.permutation_test(
(a_samples, b_samples),
# Compute the difference of means, such that a positive difference indicates potential for regression.
lambda x, y: statistics.mean(y) - statistics.mean(x),
vectorized=False,
)


@contextlib.contextmanager
def temporary_checkout(revision: str):
"""
@@ -106,9 +132,12 @@ def temporary_checkout(revision: str):
happen along the way.
"""
with TemporaryDirectory() as tmp_dir:
utils.run_cmd(
f"git clone https://github.com/firecracker-microvm/firecracker {tmp_dir}"
)
# `git clone` can take a path instead of a URL, which causes it to create a copy of the
# repository at the given path. However, that path needs to point to the root of a repository;
# it cannot be some arbitrary subdirectory. Therefore:
_, git_root, _ = utils.run_cmd("git rev-parse --show-toplevel")
# split off the '\n' at the end of the stdout
utils.run_cmd(f"git clone {git_root.strip()} {tmp_dir}")

with chdir(tmp_dir):
utils.run_cmd(f"git checkout {revision}")
16 changes: 16 additions & 0 deletions tests/host_tools/metrics.py
@@ -43,7 +43,10 @@
"""

import asyncio
import json
import os
import socket
from urllib.parse import urlparse

from aws_embedded_metrics.logger.metrics_logger_factory import create_metrics_logger

@@ -89,3 +92,16 @@ def get_metrics_logger():
logger = create_metrics_logger()
logger.reset_dimensions(False)
return MetricsWrapper(logger)


def emit_raw_emf(emf_msg: dict):
"""Emits a raw EMF log message to the local cloudwatch agent"""
if "AWS_EMF_AGENT_ENDPOINT" not in os.environ:
return

emf_endpoint = urlparse(os.environ["AWS_EMF_AGENT_ENDPOINT"])
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
sock.sendto(
(json.dumps(emf_msg) + "\n").encode("utf-8"),
(emf_endpoint.hostname, emf_endpoint.port),
)
225 changes: 225 additions & 0 deletions tools/ab_test.py
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Script for running A/B-Tests
The script takes two git revisions and a pytest integration test. It utilizes
our integration test framework's --binary-dir parameter to execute the given
test using binaries compiled from each revision, and captures the EMF log
output. It then searches for list-valued properties/metrics in the EMF, and runs a
regression test comparing these lists for the two runs.
It performs the A/B-test as follows:
For each EMF log message output, look at the dimensions. The script assumes that
dimensions are unique across all log messages output from a single test run. In
each log message, then look for all properties that have lists assigned to them,
and collect them. For both runs of the test, the set of distinct dimensions
collected this way must be the same. Then, we match corresponding dimensions
between the two runs, performing a statistical regression test across all the
list-valued properties collected.
"""
import argparse
import asyncio
import json
import platform
import shutil
import statistics
import sys
from pathlib import Path

from aws_embedded_metrics.logger.metrics_logger_factory import create_metrics_logger

# Hack to be able to use our test framework code
sys.path.append(str(Path(__file__).parent.parent / "tests"))

# pylint:disable=wrong-import-position
from framework import utils
from framework.ab_test import chdir, check_regression, git_ab_test
from host_tools.metrics import emit_raw_emf


def extract_dimensions(emf):
"""Extracts the cloudwatch dimensions from an EMF log message"""
dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0]
return {key: emf[key] for key in emf if key in dimension_list}


def reemit_emf_and_get_data(log_entry: str, revision: str):
"""Parses the given EMF log entry, and reemits it, overwriting the attached "git_commit_id" field
with the given revision.
Returns the entry's dimensions and its list-valued properties/metrics."""
emf = json.loads(log_entry)
emf["git_commit_id"] = revision
emit_raw_emf(emf)

result = {key: value for key, value in emf.items() if isinstance(value, list)}

return extract_dimensions(emf), result


def load_data_series(revision: str):
"""Loads the data series relevant for A/B-testing from test_results/test-report.json
into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
its list-valued properties/metrics.
Also reemits all EMF logs."""
data = {}

report = json.loads(Path("test_results/test-report.json").read_text("UTF-8"))
for test in report["tests"]:
for line in test["teardown"]["stdout"].splitlines():
# Only look at EMF log messages. If we ever have other stdout that starts with braces,
# we will need to rethink this heuristic.
if line.startswith("{"):
dimensions, result = reemit_emf_and_get_data(line, revision)

data[frozenset(dimensions.items())] = result

return data


def collect_data(firecracker_checkout: Path, test: str):
"""Executes the specified test using a firecracker binary compiled from the given checkout"""
with chdir(firecracker_checkout):
revision = utils.run_cmd("git rev-parse HEAD").stdout.strip()

binary_dir = Path.cwd() / "build" / revision

if not (binary_dir / "firecracker").exists():
with chdir(firecracker_checkout):
print(f"Compiling firecracker from revision {binary_dir.name}")
utils.run_cmd("./tools/release.sh --libc musl --profile release")
build_dir = (
firecracker_checkout
/ f"build/cargo_target/{platform.machine()}-unknown-linux-musl/release"
)
binary_dir.mkdir(parents=True, exist_ok=True)
shutil.copy(build_dir / "firecracker", binary_dir)
shutil.copy(build_dir / "jailer", binary_dir)
else:
print(f"Using existing binaries for revision {binary_dir.name}")

print("Collecting samples")
_, stdout, _ = utils.run_cmd(
f"AWS_EMF_ENVIRONMENT=local AWS_EMF_NAMESPACE=local ./tools/test.sh --binary-dir=/firecracker/build/{revision} {test} -m nonci"
)
print(stdout.strip())

return load_data_series(revision)


def analyze_data(data_a, data_b):
"""
Analyzes the A/B-test data produced by `collect_data`, by performing regression tests
as described in this script's doc-comment.
Returns a mapping of dimensions and properties/metrics to the result of their regression test.
"""
assert set(data_a.keys()) == set(
data_b.keys()
), "A and B run produced incomparable data. This is a bug in the test!"

results = {}

metrics_logger = create_metrics_logger()

for config in data_a:
a_result = data_a[config]
b_result = data_b[config]

assert set(a_result.keys()) == set(
b_result.keys()
), "A and B run produced incomparable data. This is a bug in the test!"

for property_name in a_result:
print(
f"Doing A/B-test for dimensions {config} and property {property_name}"
)
result = check_regression(a_result[property_name], b_result[property_name])

metrics_logger.set_dimensions(dict(config))
metrics_logger.put_metric("p_value", result.pvalue, "None")
metrics_logger.put_metric("mean_difference", result.statistic, "None")
metrics_logger.set_property("data_a", a_result[property_name])
metrics_logger.set_property("data_b", b_result[property_name])
asyncio.run(metrics_logger.flush())

results[config, property_name] = result

return results


def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh):
"""Does an A/B-test of the specified test across the given revisions"""
_, commit_list, _ = utils.run_cmd(
f"git --no-pager log --oneline {a_revision}..{b_revision}"
)
print(
f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
)
print(commit_list.strip())

a_result, _, results = git_ab_test(
lambda checkout, _: collect_data(checkout, test),
analyze_data,
a_revision=a_revision,
b_revision=b_revision,
)

failures = []
for (config, property_name), result in results.items():
baseline = a_result[config][property_name]
if (
result.pvalue < p_thresh
and abs(result.statistic) > abs(statistics.mean(baseline)) * strength_thresh
):
failures.append((config, property_name, result))

failure_report = "\n".join(
f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a regression of \033[0;31m\033[1m{result.statistic:.2f}ms\033[0m for metric \033[1m{property_name}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. This means that observing a change of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. Tested Dimensions:\n{json.dumps(dict(config), indent=2)}"
for (config, property_name, result) in failures
)
assert not failures, "\n" + failure_report
print("No regressions detected!")


def canonicalize_revision(revision):
"""Canonicalizes the given revision to a 40-digit hex SHA"""
return utils.run_cmd(f"git rev-parse {revision}").stdout.strip()


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Executes Firecracker's A/B testsuite across the specified commits"
)
parser.add_argument(
"a_revision",
help="The baseline revision compared to which we want to avoid regressing",
)
parser.add_argument(
"b_revision",
help="The revision whose performance we want to compare against the results from a_revision",
)
parser.add_argument("--test", help="The test to run", required=True)
parser.add_argument(
"--significance",
help="The p-value threshold that needs to be crossed for a test result to be considered significant",
default=0.01,
)
parser.add_argument(
"--relative-strength",
help="The minimal delta required before a regression will be considered valid",
default=0.2,
)
args = parser.parse_args()

ab_performance_test(
# These will show up in Cloudwatch, so canonicalize to long commit SHAs
canonicalize_revision(args.a_revision),
canonicalize_revision(args.b_revision),
args.test,
args.significance,
args.relative_strength,
)
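
Restated as a standalone sketch, the failure criterion above flags a
regression only when the result is both statistically significant and
large relative to the baseline mean (all numbers below are hypothetical):

import statistics

p_thresh = 0.01        # --significance default
strength_thresh = 0.2  # --relative-strength default

baseline = [10.0, 10.2, 9.8]            # hypothetical A-revision samples
pvalue, mean_difference = 0.003, 2.5    # hypothetical permutation-test result

is_regression = (
    pvalue < p_thresh
    and abs(mean_difference) > abs(statistics.mean(baseline)) * strength_thresh
)
print(is_regression)  # True: significant, and larger than 20% of the baseline mean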
14 changes: 12 additions & 2 deletions tools/devtool
@@ -532,7 +532,7 @@ ensure_ci_artifacts() {
# Please see `$0 help` for more information.
#
cmd_test() {

do_ab_test=0
# Parse any command line args.
while [ $# -gt 0 ]; do
case "$1" in
@@ -545,6 +545,9 @@
shift
local cpuset_mems="$1"
;;
"--ab")
do_ab_test=1
;;
"--") { shift; break; } ;;
*)
die "Unknown argument: $1. Please use --help for help."
@@ -583,6 +586,12 @@
say "Detected CI, tuning CPU frequency scaling for reduced variability"
fi

test_script="./tools/test.sh"

if [ $do_ab_test -eq 1 ]; then
test_script="./tools/ab_test.py"
fi

run_devctr \
--privileged \
--security-opt seccomp=unconfined \
@@ -594,7 +603,7 @@
--cpuset-mems="$cpuset_mems" \
--env-file env.list \
-- \
./tools/test.sh "$@"
$test_script "$@"

ret=$?

@@ -610,6 +619,7 @@
return $ret
}


# `$0 shell` - drop to a shell prompt inside the dev container
# Please see `$0 help` for more information.
#
