From 269bb1a61ca25028a755e1d643baa595e0769bb8 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Tue, 5 Sep 2023 14:28:00 +0100
Subject: [PATCH] test: script for executing A/B-Tests

This script is intended to be executed by our CI to perform an A/B-test
across two commits (for instance, a buildkite pipeline would get
triggered on PR merge, and the pipeline would call this script with the
commit range of the just-merged PR). It compiles the two git revisions
of firecracker using each revision's devtool, and then passes the
resulting binaries to the relevant A/B-test.

After collecting data for both the A and the B revision, it analyzes
the produced EMF logs for raw time series (i.e. EMF properties that are
assigned lists of values). For each such data series found, it then
performs a statistical test to assert that there is no regression in
this data series (for this, it asserts that both the A and the B
revision produce the same EMF messages (based on dimensions), and that
for each unique dimension, the same data series are emitted).

We choose a permutation test as it is non-parametric (which we need
since we cannot make normality assumptions about arbitrary performance
data). Non-parametric here means it compares two arbitrary sets of
samples, and then gives us a p-value for the null hypothesis H_0 "both
sets of samples were drawn from the same (unknown) distribution". The
p-value is easy to interpret, as it tells us the probability of
observing a result at least as extreme as the one actually measured,
given that performance did not change.

Signed-off-by: Patrick Roy
---
 tests/framework/ab_test.py |  37 +++++-
 tools/ab_test.py           | 260 +++++++++++++++++++++++++++++++++++++
 tools/devtool              |  14 +-
 3 files changed, 305 insertions(+), 6 deletions(-)
 create mode 100755 tools/ab_test.py

diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
index 6520d9c73ee8..316ecc5704b9 100644
--- a/tests/framework/ab_test.py
+++ b/tests/framework/ab_test.py
@@ -23,9 +23,12 @@
 """
 import contextlib
 import os
+import statistics
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Callable, Optional, TypeVar
+from typing import Callable, List, Optional, TypeVar
+
+import scipy
 
 from framework import utils
 
@@ -97,6 +100,29 @@ def git_ab_test(
     return result_a, result_b, comparison
 
 
+def check_regression(a_samples: List[float], b_samples: List[float]):
+    """Checks for a regression by performing a permutation test.
+
+    A permutation test is a non-parametric test that takes three parameters:
+    two populations (sets of samples) and a function computing a "statistic"
+    based on the two populations. First, the test computes the statistic for
+    the initial populations. It then randomly permutes the two populations
+    (i.e. merges them and then randomly splits them again). For each such
+    permuted population, the statistic is computed. Then, all the statistics
+    are sorted, and the percentile of the statistic for the initial
+    populations is computed. We then look at the fraction of statistics that
+    are larger/smaller than that of the initial populations. The minimum of
+    these two fractions then becomes the p-value.
+
+    The idea is that if the two populations are indeed drawn from the same
+    distribution (i.e. if performance did not change), then permuting will
+    not affect the statistic (indeed, the statistics should be approximately
+    normally distributed, and the statistic for the initial populations will
+    be somewhere "in the middle").
+
+    Useful for performance tests.
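+
+    As a (made-up) numerical illustration, with the difference of means as
+    the statistic: for a_samples = [10, 11] and b_samples = [15, 16], the
+    observed statistic is 15.5 - 10.5 = 5. The test then recomputes the
+    statistic over random re-splits of the merged samples [10, 11, 15, 16]
+    and reports how often a value at least as extreme as 5 arises by chance.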
+    """
+    return scipy.stats.permutation_test(
+        (a_samples, b_samples),
+        # Compute the difference of means, such that a positive difference
+        # indicates a potential regression.
+        lambda x, y: statistics.mean(y) - statistics.mean(x),
+        vectorized=False,
+    )
+
+
 @contextlib.contextmanager
 def temporary_checkout(revision: str):
     """
@@ -106,9 +132,12 @@ def temporary_checkout(revision: str):
     happen along the way.
     """
     with TemporaryDirectory() as tmp_dir:
-        utils.run_cmd(
-            f"git clone https://github.com/firecracker-microvm/firecracker {tmp_dir}"
-        )
+        # `git clone` can take a path instead of a URL, which causes it to create a copy of
+        # the repository at the given path. However, that path needs to point to the root of
+        # a repository; it cannot be some arbitrary subdirectory. Therefore:
+        _, git_root, _ = utils.run_cmd("git rev-parse --show-toplevel")
+        # split off the '\n' at the end of the stdout
+        utils.run_cmd(f"git clone {git_root.strip()} {tmp_dir}")
         with chdir(tmp_dir):
             utils.run_cmd(f"git checkout {revision}")
 
diff --git a/tools/ab_test.py b/tools/ab_test.py
new file mode 100755
index 000000000000..9049f1c5fbe8
--- /dev/null
+++ b/tools/ab_test.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Script for running A/B-Tests
+
+The script takes two git revisions and a pytest integration test. It utilizes
+our integration test framework's --binary-dir parameter to execute the given
+test using binaries compiled from each revision, and captures the EMF log
+output. It then searches for list-valued properties in the EMF, and runs a
+regression test comparing these lists for the two runs.
+
+It performs the A/B-test as follows:
+For each EMF log message output, look at the dimensions. The script assumes
+that dimensions are unique across all log messages output from a single test
+run. Within each log message, it then looks for all properties that have
+lists assigned to them, and collects them. For both runs of the test, the
+set of distinct dimensions collected this way must be the same. Then, we
+match corresponding dimensions between the two runs and perform a statistical
+regression test across all the list-valued properties collected.
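+
+Example invocation (revisions and test path are illustrative):
+
+    tools/ab_test.py main my-feature-branch \
+        --test integration_tests/performance/test_boottime.py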
+"""
+import argparse
+import asyncio
+import json
+import platform
+import shutil
+import statistics
+import sys
+from pathlib import Path
+
+from aws_embedded_metrics.logger.metrics_logger_factory import create_metrics_logger
+
+# Hack to be able to use our test framework code
+sys.path.append(str(Path(__file__).parent.parent / "tests"))
+
+# pylint:disable=wrong-import-position
+from framework import utils
+from framework.ab_test import chdir, check_regression, git_ab_test
+
+
+def extract_dimensions(emf):
+    """Extracts the CloudWatch dimensions from an EMF log message"""
+    dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0]
+    dimensions = {}
+
+    for key in emf:
+        if key in dimension_list:
+            dimensions[key] = emf[key]
+
+    return dimensions
+
+
+def extract_metrics(emf):
+    """Extracts the CloudWatch metrics from an EMF log message"""
+    metric_description = emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
+    metrics = {}
+
+    for key in emf:
+        try:
+            metric = next(
+                metric for metric in metric_description if metric["Name"] == key
+            )
+            metrics[key] = (emf[key], metric["Unit"])
+        except StopIteration:
+            pass
+
+    return metrics
+
+
+def reemit_emf_log_entry(log_entry: str, revision: str):
+    """Parses the given EMF log entry and reemits it, overwriting the attached "git_commit_id" field with the given revision"""
+    metrics_logger = create_metrics_logger()
+    emf = json.loads(log_entry)
+    emf["git_commit_id"] = revision
+
+    dimensions = extract_dimensions(emf)
+    metrics = extract_metrics(emf)
+
+    metrics_logger.set_dimensions(dimensions)
+
+    for name, (value, unit) in metrics.items():
+        metrics_logger.put_metric(name, value, unit)
+
+    result = {}
+
+    for key in emf:
+        if key not in dimensions and key not in metrics:
+            metrics_logger.set_property(key, emf[key])
+
+            if isinstance(emf[key], list):
+                result[key] = emf[key]
+
+    asyncio.run(metrics_logger.flush())
+
+    return frozenset(dimensions.items()), result
+
+
+def load_data_series(revision: str):
+    """Loads the data series relevant for A/B-testing from test_results/test-report.json
+    into a dictionary mapping each message's CloudWatch dimensions to a dictionary of
+    its list-valued properties.
+
+    Also reemits all EMF logs using aws-embedded-metrics."""
+    data = {}
+
+    report = json.loads(Path("test_results/test-report.json").read_text("UTF-8"))
+    for test in report["tests"]:
+        for line in test["teardown"]["stdout"].splitlines():
+            # Only look at EMF log messages. If we ever have other stdout that
+            # starts with braces, we will need to rethink this heuristic.
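+            # For reference, a (hypothetical, heavily abbreviated) EMF entry
+            # has the shape {"_aws": {"CloudWatchMetrics": [...]}, ...}, so a
+            # leading "{" identifies candidate lines.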
+            if line.startswith("{"):
+                dimensions, result = reemit_emf_log_entry(line, revision)
+                data[dimensions] = result
+
+    return data
+
+
+def collect_data(firecracker_checkout: Path, test: str):
+    """Executes the specified test using a firecracker binary compiled from the given checkout"""
+    with chdir(firecracker_checkout):
+        revision = utils.run_cmd("git rev-parse HEAD").stdout.strip()
+
+    binary_dir = Path.cwd() / "build" / revision
+
+    if not (binary_dir / "firecracker").exists():
+        with chdir(firecracker_checkout):
+            print(f"Compiling firecracker from revision {binary_dir.name}")
+            utils.run_cmd("./tools/release.sh --libc musl --profile release")
+        build_dir = (
+            firecracker_checkout
+            / f"build/cargo_target/{platform.machine()}-unknown-linux-musl/release"
+        )
+        binary_dir.mkdir(parents=True, exist_ok=True)
+        shutil.copy(build_dir / "firecracker", binary_dir)
+        shutil.copy(build_dir / "jailer", binary_dir)
+    else:
+        print(f"Using existing binaries for revision {binary_dir.name}")
+
+    print("Collecting samples")
+    _, stdout, _ = utils.run_cmd(
+        f"AWS_EMF_ENVIRONMENT=local AWS_EMF_NAMESPACE=local ./tools/test.sh --binary-dir=/firecracker/build/{revision} {test} -m nonci"
+    )
+    print(stdout.strip())
+
+    return load_data_series(revision)
+
+
+def analyze_data(data_a, data_b):
+    """
+    Analyzes the A/B-test data produced by `collect_data` by performing regression tests
+    as described in this script's doc-comment.
+
+    Returns a mapping of dimensions and properties to the result of their regression test.
+    """
+    assert set(data_a.keys()) == set(
+        data_b.keys()
+    ), "A and B run produced incomparable data. This is a bug in the test!"
+
+    results = {}
+
+    metrics_logger = create_metrics_logger()
+
+    for config in data_a:
+        a_result = data_a[config]
+        b_result = data_b[config]
+
+        assert set(a_result.keys()) == set(
+            b_result.keys()
+        ), "A and B run produced incomparable data. This is a bug in the test!"
+
+        for property_name in a_result:
+            print(
+                f"Doing A/B-test for dimensions {config} and property {property_name}"
+            )
+            result = check_regression(a_result[property_name], b_result[property_name])
+
+            metrics_logger.set_dimensions(dict(config))
+            metrics_logger.put_metric("p_value", result.pvalue, "None")
+            metrics_logger.put_metric("mean_difference", result.statistic, "None")
+            metrics_logger.set_property("data_a", a_result[property_name])
+            metrics_logger.set_property("data_b", b_result[property_name])
+            asyncio.run(metrics_logger.flush())
+
+            results[config, property_name] = result
+
+    return results
+
+
+def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh):
+    """Does an A/B-test of the specified test across the given revisions"""
+    _, commit_list, _ = utils.run_cmd(
+        f"git --no-pager log --oneline {a_revision}..{b_revision}"
+    )
+    print(
+        f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
+    )
+    print(commit_list.strip())
+
+    a_result, _, results = git_ab_test(
+        lambda checkout, _: collect_data(checkout, test),
+        analyze_data,
+        a_revision=a_revision,
+        b_revision=b_revision,
+    )
+
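+    # Illustration of the check below, using the CLI defaults (p_thresh=0.01,
+    # strength_thresh=0.2): a metric whose baseline mean is 100ms is only
+    # flagged as regressed if the permutation test yields p < 0.01 AND the
+    # observed mean difference exceeds 0.2 * 100ms = 20ms.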
+    failures = []
+    for (config, property_name), result in results.items():
+        baseline = a_result[config][property_name]
+        if (
+            result.pvalue < p_thresh
+            and abs(result.statistic) > abs(statistics.mean(baseline)) * strength_thresh
+        ):
+            failures.append((config, property_name, result))
+
+    failure_report = "\n".join(
+        f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a regression of \033[0;31m\033[1m{result.statistic:.2f}ms\033[0m for metric \033[1m{property_name}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. This means that observing a change of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. Tested Dimensions:\n{json.dumps(dict(config), indent=2)}"
+        for (config, property_name, result) in failures
+    )
+    assert not failures, "\n" + failure_report
+    print("No regressions detected!")
+
+
+def canonicalize_revision(revision):
+    """Canonicalizes the given revision to a 40-digit hex SHA"""
+    return utils.run_cmd(f"git rev-parse {revision}").stdout.strip()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Executes Firecracker's A/B test suite across the specified commits"
+    )
+    parser.add_argument(
+        "a_revision",
+        help="The baseline revision compared to which we want to avoid regressing",
+    )
+    parser.add_argument(
+        "b_revision",
+        help="The revision whose performance we want to compare against the results from a_revision",
+    )
+    parser.add_argument("--test", help="The test to run", required=True)
+    parser.add_argument(
+        "--significance",
+        help="The p-value threshold that needs to be crossed for a test result to be considered significant",
+        type=float,
+        default=0.01,
+    )
+    parser.add_argument(
+        "--relative-strength",
+        help="The minimal delta (as a fraction of the baseline mean) required before a regression will be considered valid",
+        type=float,
+        default=0.2,
+    )
+    args = parser.parse_args()
+
+    ab_performance_test(
+        # These will show up in CloudWatch, so canonicalize to long commit SHAs
+        canonicalize_revision(args.a_revision),
+        canonicalize_revision(args.b_revision),
+        args.test,
+        args.significance,
+        args.relative_strength,
+    )
diff --git a/tools/devtool b/tools/devtool
index 66dabe5ebee3..cbabc5ccf001 100755
--- a/tools/devtool
+++ b/tools/devtool
@@ -532,7 +532,7 @@ ensure_ci_artifacts() {
 # Please see `$0 help` for more information.
 #
 cmd_test() {
-
+    do_ab_test=0
     # Parse any command line args.
     while [ $# -gt 0 ]; do
        case "$1" in
@@ -545,6 +545,9 @@ cmd_test() {
                shift
                local cpuset_mems="$1"
                ;;
+            "--ab")
+                do_ab_test=1
+                ;;
            "--") { shift; break; } ;;
            *)
                die "Unknown argument: $1. Please use --help for help."
@@ -583,6 +586,12 @@ cmd_test() {
        say "Detected CI, tuning CPU frequency scaling for reduced variability"
    fi
 
+    test_script="./tools/test.sh"
+
+    if [ $do_ab_test -eq 1 ]; then
+        test_script="./tools/ab_test.py"
+    fi
+
    run_devctr \
        --privileged \
        --security-opt seccomp=unconfined \
@@ -594,7 +603,7 @@ cmd_test() {
        --cpuset-mems="$cpuset_mems" \
        --env-file env.list \
        -- \
-        ./tools/test.sh "$@"
+        $test_script "$@"
 
    ret=$?
 
@@ -610,6 +619,7 @@
    return $ret
 }
 
+
 # `$0 shell` - drop to a shell prompt inside the dev container
 # Please see `$0 help` for more information.
 #