test: Add a snapshot restore test compatible with A/B-testing
We add a version of the nightly snapshot restore test that does not interface with the current baseline-based testing framework. Instead, it will only write the latency samples to test_results. It respects the --binary-dir option, meaning this test can be used to collect latency samples for firecracker binaries compiled from old revisions.

We choose this approach of doing a "data production only" test, instead of having the actual A/B-test be done inside of pytest, for multiple reasons:

- We cannot compile old firecracker versions from inside pytest, as this would require us to nest docker (or rely on the old firecracker revision being compilable with the current docker container).
- Doing the A/B-orchestration outside of the test means the test does not need to support "metrics only" and "A/B" modes (with the former being required for nightly data collection runs).

Signed-off-by: Patrick Roy <[email protected]>
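To make the intended workflow concrete, the external orchestration might drive this test once per prebuilt binary, along the lines of the sketch below. The build paths and the subprocess-based invocation are assumptions for illustration; only the --binary-dir option and the test file path come from this commit.

# Hypothetical orchestration sketch (not part of this commit): collect
# latency samples for two firecracker builds by running the test once per
# binary directory; each run writes its samples to test_results.
import subprocess

for binary_dir in ("build/baseline", "build/candidate"):  # assumed paths
    subprocess.run(
        [
            "pytest",
            "integration_tests/performance/test_snapshot_ab.py",
            f"--binary-dir={binary_dir}",
        ],
        check=True,
    )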
Showing 2 changed files with 180 additions and 1 deletion.
tests/integration_tests/performance/test_snapshot_ab.py: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Performance benchmark for snapshot restore."""
import shutil
import statistics
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import List

import pytest

import host_tools.drive as drive_tools
from framework.microvm import Snapshot
from framework.properties import global_props

USEC_IN_MSEC = 1000
ITERATIONS = 30


@lru_cache
def get_scratch_drives():
    """Create an array of scratch disks."""
    scratchdisks = ["vdb", "vdc", "vdd", "vde"]
    return [
        (drive, drive_tools.FilesystemFile(tempfile.mktemp(), size=64))
        for drive in scratchdisks
    ]


@dataclass
class SnapshotRestoreTest:
    """Dataclass encapsulating properties of snapshot restore tests"""

    vcpus: int = 1
    mem: int = 128
    nets: int = 3
    blocks: int = 3
    all_devices: bool = False

    @property
    def id(self):
        """Computes a unique id for this test instance"""
        return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb"

    @property
    def dimensions(self):
        """Gets the CloudWatch dimensions for this test"""
        return {
            "vcpus": str(self.vcpus),
            "guest_memory": f"{self.mem}MB",
            "net_devices": str(self.nets),
            "block_devices": str(self.blocks),
            "vsock_devices": str(int(self.all_devices)),
            "balloon_devices": str(int(self.all_devices)),
        }

    def create_snapshot(
        self,
        microvm_factory,
        guest_kernel,
        rootfs,
    ) -> Snapshot:
        """Creates the initial snapshot that will be loaded repeatedly to sample latencies"""
        vm = microvm_factory.build(
            guest_kernel,
            rootfs,
            monitor_memory=False,
        )
        vm.spawn(log_level="Info")
        vm.time_api_requests = False
        vm.basic_config(
            vcpu_count=self.vcpus,
            mem_size_mib=self.mem,
            rootfs_io_engine="Sync",
        )

        for _ in range(self.nets):
            vm.add_net_iface()

        # The rootfs already occupies one block device slot, so attach
        # `blocks - 1` additional scratch drives.
        if self.blocks > 1:
            scratch_drives = get_scratch_drives()
            for name, diskfile in scratch_drives[: (self.blocks - 1)]:
                vm.add_drive(name, diskfile.path, io_engine="Sync")

        if self.all_devices:
            vm.api.balloon.put(
                amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
            )
            vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/v.sock")

        vm.start()
        snapshot = vm.snapshot_full()
        vm.kill()

        return snapshot

    def sample_latency(self, microvm_factory, guest_kernel, rootfs) -> List[float]:
        """Collects latency samples for the microvm configuration specified by this instance"""
        snapshot = self.create_snapshot(microvm_factory, guest_kernel, rootfs)

        values = []

        for _ in range(ITERATIONS):
            microvm = microvm_factory.build(
                monitor_memory=False,
            )
            microvm.spawn()
            microvm.restore_from_snapshot(snapshot, resume=True)

            # Check that the guest still runs commands.
            exit_code, _, _ = microvm.ssh.run("true")
            assert exit_code == 0

            value = 0
            # Parse all metric data points in search of the load_snapshot time.
            microvm.flush_metrics()
            metrics = microvm.get_all_metrics()
            for data_point in metrics:
                cur_value = data_point["latencies_us"]["load_snapshot"]
                if cur_value > 0:
                    value = cur_value / USEC_IN_MSEC
                    break
            assert value > 0
            values.append(value)
            microvm.kill()
            # Remove the microvm's chroot so the ITERATIONS restore cycles do
            # not accumulate stale jailer directories on disk.
            shutil.rmtree(Path(microvm.chroot()))

        snapshot.delete()
        return values


@pytest.mark.nonci
@pytest.mark.parametrize(
    "test_setup",
    [
        SnapshotRestoreTest(mem=128, vcpus=1),
        SnapshotRestoreTest(mem=1024, vcpus=1),
        SnapshotRestoreTest(mem=2048, vcpus=2),
        SnapshotRestoreTest(mem=4096, vcpus=3),
        SnapshotRestoreTest(mem=6144, vcpus=4),
        SnapshotRestoreTest(mem=8192, vcpus=5),
        SnapshotRestoreTest(mem=10240, vcpus=6),
        SnapshotRestoreTest(mem=12288, vcpus=7),
        SnapshotRestoreTest(all_devices=True),
    ],
    ids=lambda x: x.id,
)
def test_restore_latency(
    microvm_factory, rootfs, guest_kernel_linux_4_14, test_setup, metrics
):
    """
    Restores snapshots across vcpu/memory configurations that scale roughly according to
    mem = (vcpus - 1) * 2048MB, which resembles firecracker production setups. Also
    contains a test case for restoring a snapshot with all devices attached to it.

    We only test a single guest kernel, as the guest kernel does not "participate" in
    snapshot restore.
    """
    samples = test_setup.sample_latency(
        microvm_factory,
        guest_kernel_linux_4_14,
        rootfs,
    )

    metrics.set_dimensions(
        {
            "instance": global_props.instance,
            "cpu_model": global_props.cpu_model,
            "host_kernel": "linux-" + global_props.host_linux_version,
            "guest_kernel": guest_kernel_linux_4_14.stem[2:],
            "rootfs": rootfs.name,
            "performance_test": "test_restore_latency",
            **test_setup.dimensions,
        }
    )

    metrics.set_property("latency", samples)
    metrics.put_metric("latency_Avg", statistics.mean(samples), "Milliseconds")
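The A/B-comparison itself then happens outside of pytest, over the sample sets the two runs emitted. A minimal sketch of what that comparison could look like, assuming the samples have already been read back from test_results (the mean-based criterion, the helper name, and the 5% threshold are illustrative, not part of this commit):

import statistics

def mean_regression_pct(baseline: list, candidate: list) -> float:
    """Percentage change of mean restore latency, candidate vs. baseline."""
    mean_base = statistics.mean(baseline)
    return (statistics.mean(candidate) - mean_base) / mean_base * 100

# Dummy sample values in milliseconds; flag a regression if the mean
# restore latency grew by more than 5%.
assert mean_regression_pct([20.1, 19.8, 20.5], [20.3, 20.0, 20.6]) <= 5.0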