test: emit metrics from A/B snapshot test
Emit the p-value and difference of means as CloudWatch metrics, so we can
generate volcano graphs from them later. Also emit the raw time series
as properties.

Signed-off-by: Patrick Roy <[email protected]>
roypat committed Sep 14, 2023
1 parent 557f86f commit 6a9a345
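
For context on the commit message: a volcano graph plots each A/B run as effect size against statistical significance, so regressions stand out in the upper corners. The sketch below is hypothetical; it only illustrates how the mean_difference and p_value metrics emitted by this change could be plotted later. The dummy data, the 0.01 cutoff, and the matplotlib wiring are assumptions, not part of the commit.

# Hypothetical sketch: volcano plot from (mean_difference, p_value) pairs,
# e.g. as retrieved from CloudWatch after several A/B runs. Dummy data only.
import math

import matplotlib.pyplot as plt

# One (mean difference in ms, p-value) pair per A/B run -- made-up values.
runs = [(0.4, 0.60), (2.1, 0.004), (-1.3, 0.02), (0.1, 0.95)]

diffs = [d for d, _ in runs]
neg_log_p = [-math.log10(p) for _, p in runs]

plt.scatter(diffs, neg_log_p)
plt.axhline(-math.log10(0.01), linestyle="--")  # example significance cutoff
plt.xlabel("mean difference [ms]")
plt.ylabel("-log10(p-value)")
plt.savefig("volcano.png")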
Showing 2 changed files with 67 additions and 24 deletions.
87 changes: 63 additions & 24 deletions tests/integration_tests/performance/test_snapshot_ab.py
@@ -16,6 +16,7 @@
 from framework.ab_test import check_regression
 from framework.defs import FC_WORKSPACE_DIR
 from framework.microvm import Snapshot
+from framework.properties import global_props
 
 USEC_IN_MSEC = 1000
 ITERATIONS = 30
@@ -50,19 +51,24 @@ class SnapshotRestoreTest:
     blocks: int = 3
     all_devices: bool = False
 
+    @property
+    def id(self):
+        """Computes a unique id for this test instance"""
+        return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb"
+
     def create_snapshot(
         self,
         microvm_factory,
         guest_kernel,
         rootfs,
-        binaries: Path,
+        binaries: Path = None,
     ) -> Snapshot:
         """Creates the initial snapshot that will be loaded repeatedly to sample latencies"""
         vm = microvm_factory.build(
             guest_kernel,
             rootfs,
-            fc_binary_path=binaries / "firecracker",
-            jailer_binary_path=binaries / "jailer",
+            fc_binary_path=binaries / "firecracker" if binaries else None,
+            jailer_binary_path=binaries / "jailer" if binaries else None,
             monitor_memory=False,
         )
         vm.spawn(log_level="Info")
@@ -94,7 +100,7 @@ def create_snapshot(
         return snapshot
 
     def sample_latency(
-        self, microvm_factory, guest_kernel, rootfs, binaries: Path
+        self, microvm_factory, guest_kernel, rootfs, binaries: Path = None
     ) -> List[float]:
         """Collects latency samples for the microvm configuration specified by this instance"""
         snapshot = self.create_snapshot(microvm_factory, guest_kernel, rootfs, binaries)
@@ -103,8 +109,8 @@
 
         for _ in range(ITERATIONS):
             microvm = microvm_factory.build(
-                fc_binary_path=binaries / "firecracker",
-                jailer_binary_path=binaries / "jailer",
+                fc_binary_path=binaries / "firecracker" if binaries else None,
+                jailer_binary_path=binaries / "jailer" if binaries else None,
                 monitor_memory=False,
             )
             microvm.spawn()
@@ -148,7 +154,9 @@ def sample_latency(
     ],
     ids=lambda x: x.id,
 )
-def test_restore_latency(microvm_factory, rootfs, guest_kernel_linux_4_14, test_setup):
+def test_restore_latency(
+    microvm_factory, rootfs, guest_kernel_linux_4_14, test_setup, metrics
+):
     """
     Restores snapshots with vcpu/memory configuration, roughly scaling according to mem = (vcpus - 1) * 2048MB,
     which resembles firecracker production setups. Also contains a test case for restoring a snapshot with all devices
@@ -159,23 +167,54 @@ def test_restore_latency(microvm_factory, rootfs, guest_kernel_linux_4_14, test_
     a_revision = os.environ.get("PERFORMANCE_TEST_REVISION_A")
     b_revision = os.environ.get("PERFORMANCE_TEST_REVISION_B")
 
-    ah = test_setup.sample_latency(
-        microvm_factory,
-        guest_kernel_linux_4_14,
-        rootfs,
-        FC_WORKSPACE_DIR / "build" / a_revision,
-    )
-    be = test_setup.sample_latency(
-        microvm_factory,
-        guest_kernel_linux_4_14,
-        rootfs,
-        FC_WORKSPACE_DIR / "build" / b_revision,
-    )
-
-    result = check_regression(ah, be, larger_is_better=False)
-
-    assert (
-        result.pvalue > SIGNIFICANCE_THRESHOLD
-        or abs(result.statistic)
-        < abs(statistics.mean(ah)) * RELATIVE_STRENGTH_THRESHOLD_MS
-    ), f"A/B-testing shows a regression of {result.statistic:.2f}ms with p={result.pvalue}. This means that observing a regression of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}%"
+    metrics.set_dimensions(
+        {
+            "instance": global_props.instance,
+            "cpu_model": global_props.cpu_model,
+            "host_kernel": "linux-" + global_props.host_linux_version,
+            "guest_kernel": "linux-4.14",
+            "rootfs": rootfs.name,
+            "performance_test": "test_snapshot_restore_performance",
+            "guest_config": test_setup.id,
+        }
+    )
+
+    if a_revision and b_revision:
+        ah = test_setup.sample_latency(
+            microvm_factory,
+            guest_kernel_linux_4_14,
+            rootfs,
+            FC_WORKSPACE_DIR / "build" / a_revision,
+        )
+        be = test_setup.sample_latency(
+            microvm_factory,
+            guest_kernel_linux_4_14,
+            rootfs,
+            FC_WORKSPACE_DIR / "build" / b_revision,
+        )
+
+        result = check_regression(ah, be, larger_is_better=False)
+
+        metrics.set_property("revision_a", a_revision)
+        metrics.set_property("revision_b", b_revision)
+        metrics.set_property("a_latencies", ah)
+        metrics.set_property("b_latencies", be)
+
+        metrics.put_metric("p_value", result.pvalue, "None")
+        metrics.put_metric("mean_difference", result.statistic, "Milliseconds")
+
+        metrics.flush()
+
+        assert (
+            result.pvalue > SIGNIFICANCE_THRESHOLD
+            or abs(result.statistic)
+            < abs(statistics.mean(ah)) * RELATIVE_STRENGTH_THRESHOLD_MS
+        ), f"A/B-testing shows a regression of {result.statistic:.2f}ms with p={result.pvalue}. This means that observing a regression of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}"
+    else:
+        latencies = test_setup.sample_latency(
+            microvm_factory, guest_kernel_linux_4_14, rootfs
+        )
+
+        metrics.set_property("latency", latencies)
+        metrics.put_metric("latency_Avg", statistics.mean(latencies), "Milliseconds")
+        metrics.flush()
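
For reference, check_regression (imported from framework.ab_test in the first hunk) returns an object exposing the statistic and pvalue fields used above. Below is a minimal sketch of such a helper, under the assumption that it wraps scipy's permutation test on the difference of sample means; the actual implementation in framework/ab_test.py may differ.

# Hypothetical check_regression-style helper; assumes scipy.stats.permutation_test
# on the difference of sample means. Illustrative, not the repository's code.
import statistics
from typing import List

from scipy.stats import permutation_test


def check_regression(a_samples: List[float], b_samples: List[float], *, larger_is_better=False):
    # Positive statistic means B's mean exceeds A's; for latency samples
    # (larger_is_better=False) that direction would indicate a regression.
    return permutation_test(
        (a_samples, b_samples),
        lambda a, b: statistics.mean(b) - statistics.mean(a),
        vectorized=False,
        alternative="two-sided",
    )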
4 changes: 4 additions & 0 deletions tools/devtool
@@ -676,13 +676,17 @@ cmd_shell() {
 cmd_sh() {
     ensure_build_dir
     ensure_ci_artifacts
+
+    env |grep -P "^(AWS_EMF_|BUILDKITE_|CODECOV_)" > env.list
     run_devctr \
         --privileged \
         --ulimit nofile=4096:4096 \
         --ulimit memlock=-1:-1 \
         --workdir "$CTR_FC_ROOT_DIR" \
+        --env-file env.list \
         -- \
         bash --norc -c "$*"
+    rm env.list
 }
 
 # Auto-format all source code, to match the Firecracker requirements. For the
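
The forwarded AWS_EMF_* variables configure the CloudWatch Embedded Metric Format client inside the container. Assuming the metrics fixture is backed by the aws-embedded-metrics Python library (an assumption made here for illustration), the calls in the test above boil down to something like this sketch:

# Hypothetical sketch: emitting EMF metrics with aws-embedded-metrics, which
# reads its configuration (namespace, environment, ...) from AWS_EMF_* env vars.
from aws_embedded_metrics import metric_scope


@metric_scope
def emit(metrics):
    metrics.set_dimensions({"performance_test": "test_snapshot_restore_performance"})
    metrics.set_property("revision_a", "some-revision-id")  # placeholder value
    metrics.put_metric("p_value", 0.03, "None")
    metrics.put_metric("mean_difference", 0.7, "Milliseconds")
    # metrics are flushed automatically when the decorated scope exits


emit()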
