Skip to content

Commit

Permalink
test: print means of data series on failure
Browse files Browse the repository at this point in the history
To help put the magnitude of the change into perspective.

Signed-off-by: Patrick Roy <[email protected]>
  • Loading branch information
roypat committed Sep 27, 2023
1 parent 1a9c002 commit c332c4c
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions tools/ab_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh)
)
print(commit_list.strip())

- processed_emf_a, _, results = git_ab_test(
+ processed_emf_a, processed_emf_b, results = git_ab_test(
lambda checkout, _: collect_data(checkout, test),
analyze_data,
a_revision=a_revision,
Expand All @@ -212,7 +212,14 @@ def ab_performance_test(a_revision, b_revision, test, p_thresh, strength_thresh)
failures.append((dimension_set, metric, result, unit))

failure_report = "\n".join(
- f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a regression of \033[0;31m\033[1m{format_with_reduced_unit(result.statistic, unit)}\033[0m for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. This means that observing a change of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
+ f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
+ f"\033[0;31m\033[1m{format_with_reduced_unit(result.statistic, unit)}\033[0m "
+ f"(from {format_with_reduced_unit(statistics.mean(processed_emf_a[dimension_set][metric][0]), unit)} "
+ f"to {format_with_reduced_unit(statistics.mean(processed_emf_b[dimension_set][metric][0]), unit)}) "
+ f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
+ f"This means that observing a change of this magnitude or worse, assuming that performance "
+ f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
+ f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
for (dimension_set, metric, result, unit) in failures
)
assert not failures, "\n" + failure_report
Expand Down

0 comments on commit c332c4c

Please sign in to comment.