From 78f75638d1bc69576cfa5855779f8318e033a6ab Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Tue, 3 Oct 2023 17:05:27 +0100
Subject: [PATCH] test(ab): Add noise threshold

The idea here is the following: Intuitively, different parameterizations
of the same metric (e.g. the metric "snapshot restore latency" across
different microvm configurations) are not completely independent.
Therefore, we would generally expect a real impact to be reflected
across multiple tested configurations. In particular, if different
configurations show mean changes in different directions (e.g. 1vcpu
gives +5ms latency, but 2vcpu gives -5ms latency), this is a further
hint that the result is noise. To account for this, we consider the
average of the reported mean regressions (relative to the baseline
mean), and only report failures for a metric whose average regression
is at least 5%.

Mathematically, this has the following justification: By the central
limit theorem, the means of samples are normally distributed. Denote by
A and B the distributions of the mean of samples from the 'A' and 'B'
tests respectively. Under our null hypothesis, the distributions of the
'A' and 'B' samples are identical (although we don't know what the
exact distributions are), meaning so are A and B, say
A ~ B ~ N(mu, sigma^2). The difference of two normal distributions is
also normally distributed, with the means being subtracted and the
variances being added. Therefore, A - B ~ N(0, 2*sigma^2).

However, each parameterization (e.g. 1vcpu, 2vcpu, and so on) will
potentially have a different variance sigma^2. Here, we make the
following assumption: For all parameterizations of the same metric, we
have sigma^2/mu^2 = const. I have no mathematical justification for
this (as it is a property of our tests), but empirically it works out.
This means that (A - B)/mu ~ N(0, c), with c being a constant that is
identical across all parameterizations of a metric. This means that we
can combine the relative mean changes across different
parameterizations, and get a distribution whose expected value is 0,
provided our null hypothesis was true.

This is exactly what the code added in this commit verifies: that the
empirical average of our samples of this distribution only negligibly
deviates from 0. Only if it deviates significantly (here, by more than
0.05) do we actually allow failures in that metric.

For all tests but the snapshot restore test on x86_64/{5.10,6.1}, this
allows us to completely drop the per-test noise threshold to 0.
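To illustrate the argument, the following is a minimal, self-contained
Python sketch (standard library only; the parameterization means, the
sigma/mu ratio, and the sample count are made-up numbers, not values
from our test suite). It simulates the null hypothesis for three
parameterizations with a fixed sigma/mu ratio and shows that the
average of the relative mean changes stays close to 0, well below the
0.05 threshold:

    import random
    import statistics

    random.seed(42)

    NOISE_THRESHOLD = 0.05  # matches the --noise-threshold default below
    REL_STDDEV = 0.1        # assumed constant sigma/mu across parameterizations
    SAMPLES = 30            # hypothetical number of measurements per run

    relative_changes = []
    for mu in [5.0, 10.0, 20.0]:  # hypothetical per-parameterization means
        sigma = REL_STDDEV * mu
        # Under the null hypothesis, the 'A' and 'B' samples come from
        # the same distribution.
        a = [random.gauss(mu, sigma) for _ in range(SAMPLES)]
        b = [random.gauss(mu, sigma) for _ in range(SAMPLES)]
        # Relative change of the mean, analogous to
        # result.statistic / baseline_mean in the code below.
        relative_changes.append(
            (statistics.mean(b) - statistics.mean(a)) / statistics.mean(a)
        )

    # Expected to be close to 0, and in particular well below the threshold.
    print(statistics.mean(relative_changes), NOISE_THRESHOLD)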
Signed-off-by: Patrick Roy
---
 tools/ab_test.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tools/ab_test.py b/tools/ab_test.py
index e38579dadf5d..7dd1f5955a8a 100755
--- a/tools/ab_test.py
+++ b/tools/ab_test.py
@@ -214,7 +214,9 @@ def analyze_data(processed_emf_a, processed_emf_b, *, n_resamples: int = 9999):
     return results
 
 
-def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh):
+def ab_performance_test(
+    a_revision, b_revision, test, p_thresh, strength_thresh, noise_threshold
+):
     """Does an A/B-test of the specified test across the given revisions"""
     _, commit_list, _ = utils.run_cmd(
         f"git --no-pager log --oneline {a_revision}..{b_revision}"
@@ -231,19 +233,28 @@ def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh)
         b_revision=b_revision,
     )
 
+    relative_changes_by_metric = {}
+
     failures = []
     for (dimension_set, metric), (result, unit) in results.items():
         if is_ignored(dict(dimension_set)):
             continue
 
         values_a = processed_emf_a[dimension_set][metric][0]
+        baseline_mean = statistics.mean(values_a)
+
+        if metric not in relative_changes_by_metric:
+            relative_changes_by_metric[metric] = []
+        relative_changes_by_metric[metric].append(result.statistic / baseline_mean)
 
         if (
             result.pvalue < p_thresh
-            and abs(result.statistic) > abs(statistics.mean(values_a)) * strength_thresh
+            and abs(result.statistic) > baseline_mean * strength_thresh
         ):
             failures.append((dimension_set, metric, result, unit))
 
+    print(relative_changes_by_metric)
+
     failure_report = "\n".join(
         f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
         f"\033[0;31m\033[1m{format_with_reduced_unit(result.statistic, unit)}\033[0m "
@@ -254,8 +265,9 @@ def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh)
         f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
         f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
         for (dimension_set, metric, result, unit) in failures
+        if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold
     )
-    assert not failures, "\n" + failure_report
+    assert not failure_report, "\n" + failure_report
 
     print("No regressions detected!")
 
@@ -287,8 +299,9 @@ def canonicalize_revision(revision):
         "--relative-strength",
         help="The minimal delta required before a regression will be considered valid",
         type=float,
-        default=0.2,
+        default=0.0,
     )
+    parser.add_argument("--noise-threshold", help="", type=float, default=0.05)
     args = parser.parse_args()
 
     ab_performance_test(
@@ -298,4 +311,5 @@ def canonicalize_revision(revision):
         args.test,
         args.significance,
         args.relative_strength,
+        args.noise_threshold,
     )
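For reference, the gating the patch adds to the failure report can be
read in isolation as the sketch below. The metric names and relative
changes are hypothetical; relative_changes_by_metric mirrors the
dictionary built in ab_performance_test, holding one value of
result.statistic / baseline_mean per tested parameterization:

    import statistics

    NOISE_THRESHOLD = 0.05

    # Hypothetical per-parameterization relative mean changes, keyed by metric.
    relative_changes_by_metric = {
        "restore_latency": [0.06, 0.07, 0.05],  # consistent regression -> reported
        "boot_time": [0.05, -0.04, 0.01],       # changes cancel out -> treated as noise
    }

    for metric, changes in relative_changes_by_metric.items():
        mean_change = statistics.mean(changes)
        verdict = "report" if abs(mean_change) > NOISE_THRESHOLD else "noise"
        print(f"{metric}: average relative change {mean_change:+.3f} -> {verdict}")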