test: emit metrics from A/B snapshot test
Emit the p-value and difference of means as CloudWatch metrics, so we can
generate volcano graphs from them later. Also emit the raw time series
as properties.

Signed-off-by: Patrick Roy <[email protected]>
roypat committed Sep 14, 2023
1 parent 557f86f commit 6a9a345
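
For context on the commit message: a volcano graph plots each A/B run as effect size against statistical significance, so regressions stand out in the upper corners. The sketch below is hypothetical; it only illustrates how the mean_difference and p_value metrics emitted by this change could be plotted later. The dummy data, the 0.01 cutoff, and the matplotlib wiring are assumptions, not part of the commit.

# Hypothetical sketch: volcano plot from (mean_difference, p_value) pairs,
# e.g. as retrieved from CloudWatch after several A/B runs. Dummy data only.
import math

import matplotlib.pyplot as plt

# One (mean difference in ms, p-value) pair per A/B run -- made-up values.
runs = [(0.4, 0.60), (2.1, 0.004), (-1.3, 0.02), (0.1, 0.95)]

diffs = [d for d, _ in runs]
neg_log_p = [-math.log10(p) for _, p in runs]

plt.scatter(diffs, neg_log_p)
plt.axhline(-math.log10(0.01), linestyle="--")  # example significance cutoff
plt.xlabel("mean difference [ms]")
plt.ylabel("-log10(p-value)")
plt.savefig("volcano.png")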
Showing 2 changed files with 67 additions and 24 deletions.
87 changes: 63 additions & 24 deletions tests/integration_tests/performance/test_snapshot_ab.py
@@ -16,6 +16,7 @@
 from framework.ab_test import check_regression
 from framework.defs import FC_WORKSPACE_DIR
 from framework.microvm import Snapshot
+from framework.properties import global_props
 
 USEC_IN_MSEC = 1000
 ITERATIONS = 30
@@ -50,19 +51,24 @@ class SnapshotRestoreTest:
     blocks: int = 3
     all_devices: bool = False
 
+    @property
+    def id(self):
+        """Computes a unique id for this test instance"""
+        return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb"
+
     def create_snapshot(
         self,
         microvm_factory,
         guest_kernel,
         rootfs,
-        binaries: Path,
+        binaries: Path = None,
     ) -> Snapshot:
         """Creates the initial snapshot that will be loaded repeatedly to sample latencies"""
         vm = microvm_factory.build(
             guest_kernel,
             rootfs,
-            fc_binary_path=binaries / "firecracker",
-            jailer_binary_path=binaries / "jailer",
+            fc_binary_path=binaries / "firecracker" if binaries else None,
+            jailer_binary_path=binaries / "jailer" if binaries else None,
             monitor_memory=False,
         )
         vm.spawn(log_level="Info")
@@ -94,7 +100,7 @@ def create_snapshot(
         return snapshot
 
     def sample_latency(
-        self, microvm_factory, guest_kernel, rootfs, binaries: Path
+        self, microvm_factory, guest_kernel, rootfs, binaries: Path = None
     ) -> List[float]:
         """Collects latency samples for the microvm configuration specified by this instance"""
         snapshot = self.create_snapshot(microvm_factory, guest_kernel, rootfs, binaries)
@@ -103,8 +109,8 @@
 
         for _ in range(ITERATIONS):
             microvm = microvm_factory.build(
-                fc_binary_path=binaries / "firecracker",
-                jailer_binary_path=binaries / "jailer",
+                fc_binary_path=binaries / "firecracker" if binaries else None,
+                jailer_binary_path=binaries / "jailer" if binaries else None,
                 monitor_memory=False,
             )
             microvm.spawn()
@@ -148,7 +154,9 @@ def sample_latency(
     ],
     ids=lambda x: x.id,
 )
-def test_restore_latency(microvm_factory, rootfs, guest_kernel_linux_4_14, test_setup):
+def test_restore_latency(
+    microvm_factory, rootfs, guest_kernel_linux_4_14, test_setup, metrics
+):
     """
     Restores snapshots with vcpu/memory configuration, roughly scaling according to mem = (vcpus - 1) * 2048MB,
     which resembles firecracker production setups. Also contains a test case for restoring a snapshot with all devices
@@ -159,23 +167,54 @@ def test_restore_latency(microvm_factory, rootfs, guest_kernel_linux_4_14, test_
     a_revision = os.environ.get("PERFORMANCE_TEST_REVISION_A")
     b_revision = os.environ.get("PERFORMANCE_TEST_REVISION_B")
 
-    ah = test_setup.sample_latency(
-        microvm_factory,
-        guest_kernel_linux_4_14,
-        rootfs,
-        FC_WORKSPACE_DIR / "build" / a_revision,
-    )
-    be = test_setup.sample_latency(
-        microvm_factory,
-        guest_kernel_linux_4_14,
-        rootfs,
-        FC_WORKSPACE_DIR / "build" / b_revision,
-    )
-
-    result = check_regression(ah, be, larger_is_better=False)
-
-    assert (
-        result.pvalue > SIGNIFICANCE_THRESHOLD
-        or abs(result.statistic)
-        < abs(statistics.mean(ah)) * RELATIVE_STRENGTH_THRESHOLD_MS
-    ), f"A/B-testing shows a regression of {result.statistic:.2f}ms with p={result.pvalue}. This means that observing a regression of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}%"
+    metrics.set_dimensions(
+        {
+            "instance": global_props.instance,
+            "cpu_model": global_props.cpu_model,
+            "host_kernel": "linux-" + global_props.host_linux_version,
+            "guest_kernel": "linux-4.14",
+            "rootfs": rootfs.name,
+            "performance_test": "test_snapshot_restore_performance",
+            "guest_config": test_setup.id,
+        }
+    )
+
+    if a_revision and b_revision:
+        ah = test_setup.sample_latency(
+            microvm_factory,
+            guest_kernel_linux_4_14,
+            rootfs,
+            FC_WORKSPACE_DIR / "build" / a_revision,
+        )
+        be = test_setup.sample_latency(
+            microvm_factory,
+            guest_kernel_linux_4_14,
+            rootfs,
+            FC_WORKSPACE_DIR / "build" / b_revision,
+        )
+
+        result = check_regression(ah, be, larger_is_better=False)
+
+        metrics.set_property("revision_a", a_revision)
+        metrics.set_property("revision_b", b_revision)
+        metrics.set_property("a_latencies", ah)
+        metrics.set_property("b_latencies", be)
+
+        metrics.put_metric("p_value", result.pvalue, "None")
+        metrics.put_metric("mean_difference", result.statistic, "Milliseconds")
+
+        metrics.flush()
+
+        assert (
+            result.pvalue > SIGNIFICANCE_THRESHOLD
+            or abs(result.statistic)
+            < abs(statistics.mean(ah)) * RELATIVE_STRENGTH_THRESHOLD_MS
+        ), f"A/B-testing shows a regression of {result.statistic:.2f}ms with p={result.pvalue}. This means that observing a regression of this magnitude or worse, assuming that performance characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}"
+    else:
+        latencies = test_setup.sample_latency(
+            microvm_factory, guest_kernel_linux_4_14, rootfs
+        )
+
+        metrics.set_property("latency", latencies)
+        metrics.put_metric("latency_Avg", statistics.mean(latencies), "Milliseconds")
+        metrics.flush()
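
For reference, check_regression (imported from framework.ab_test in the first hunk) returns an object exposing the statistic and pvalue fields used above. Below is a minimal sketch of such a helper, under the assumption that it wraps scipy's permutation test on the difference of sample means; the actual implementation in framework/ab_test.py may differ.

# Hypothetical check_regression-style helper; assumes scipy.stats.permutation_test
# on the difference of sample means. Illustrative, not the repository's code.
import statistics
from typing import List

from scipy.stats import permutation_test


def check_regression(a_samples: List[float], b_samples: List[float], *, larger_is_better=False):
    # Positive statistic means B's mean exceeds A's; for latency samples
    # (larger_is_better=False) that direction would indicate a regression.
    return permutation_test(
        (a_samples, b_samples),
        lambda a, b: statistics.mean(b) - statistics.mean(a),
        vectorized=False,
        alternative="two-sided",
    )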
4 changes: 4 additions & 0 deletions tools/devtool
@@ -676,13 +676,17 @@ cmd_shell() {
 cmd_sh() {
     ensure_build_dir
     ensure_ci_artifacts
+
+    env |grep -P "^(AWS_EMF_|BUILDKITE_|CODECOV_)" > env.list
     run_devctr \
         --privileged \
         --ulimit nofile=4096:4096 \
         --ulimit memlock=-1:-1 \
         --workdir "$CTR_FC_ROOT_DIR" \
+        --env-file env.list \
         -- \
         bash --norc -c "$*"
+    rm env.list
 }
 
 # Auto-format all source code, to match the Firecracker requirements. For the
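
The forwarded AWS_EMF_* variables configure the CloudWatch Embedded Metric Format client inside the container. Assuming the metrics fixture is backed by the aws-embedded-metrics Python library (an assumption made here for illustration), the calls in the test above boil down to something like this sketch:

# Hypothetical sketch: emitting EMF metrics with aws-embedded-metrics, which
# reads its configuration (namespace, environment, ...) from AWS_EMF_* env vars.
from aws_embedded_metrics import metric_scope


@metric_scope
def emit(metrics):
    metrics.set_dimensions({"performance_test": "test_snapshot_restore_performance"})
    metrics.set_property("revision_a", "some-revision-id")  # placeholder value
    metrics.put_metric("p_value", 0.03, "None")
    metrics.put_metric("mean_difference", 0.7, "Milliseconds")
    # metrics are flushed automatically when the decorated scope exits


emit()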
