From b3b3cab746483e59f137d1acff87ad0a3afa08ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?= Date: Mon, 21 Aug 2023 17:10:17 +0200 Subject: [PATCH 1/3] ci: decorate metrics with the host OS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, kernel version has been enough to disambiguate the OS, but by being explicit we can disambiguate in the case we have distinct OSes with the same kernel. Signed-off-by: Pablo Barbáchano --- tests/framework/properties.py | 19 +++++++++++++++++++ .../performance/test_boottime.py | 1 + 2 files changed, 20 insertions(+) diff --git a/tests/framework/properties.py b/tests/framework/properties.py index f3572c7b1bb..d934e5d6296 100644 --- a/tests/framework/properties.py +++ b/tests/framework/properties.py @@ -35,6 +35,24 @@ def get_os_version(): return match.group(1) +def get_host_os(kv: str = None): + """ + Extract OS information from the kernel if it's there. + + This only works for AL2 and AL2023 + + >>> get_host_os("6.1.41-63.118.amzn2023.x86_64") + amzn2023 + """ + if kv is None: + kv = platform.release() + parts = kv.split("-") + misc = parts[1].split(".") + if len(misc) > 2 and misc[2] in {"amzn2", "amzn2023"}: + return misc[2] + return None + + class GlobalProps: """Class to hold metadata about the testrun environment""" @@ -52,6 +70,7 @@ def __init__(self): # major.minor.patch self.host_linux_patch = get_kernel_version(2) self.os = get_os_version() + self.host_os = get_host_os() self.libc_ver = "-".join(platform.libc_ver()) self.git_commit_id = run_cmd("git rev-parse HEAD") self.git_branch = run_cmd("git show -s --pretty=%D HEAD") diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 2a9f1e41a0c..96e7599e836 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -26,6 +26,7 @@ DIMENSIONS = { "instance": global_props.instance, "cpu_model": global_props.cpu_model, + "host_os": global_props.host_os, "host_kernel": "linux-" + global_props.host_linux_version, } From ecd36d11c260623bedf061d04a47fd1b6c7a53de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?= Date: Wed, 21 Jun 2023 10:41:34 +0200 Subject: [PATCH 2/3] test: apply mitigations in Linux 6.1 to make boot times fast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux 6.1 brought a guest boottime performance regression. This commits documents the issue in ./docs/prod-host-setup.md and uses the mitigations to enforce our boot time SLA. Signed-off-by: Pablo Barbáchano Co-authored-by: Luiz Capitulino --- CHANGELOG.md | 4 + README.md | 4 +- docs/prod-host-setup.md | 83 +++++++++++++++++++ .../performance/test_boottime.py | 10 --- tools/devtool | 50 +++++++++++ 5 files changed, 139 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 778e5f13902..aadf78b45c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ ### Added +- [#3837](https://github.com/firecracker-microvm/firecracker/issues/3837): Added + official support for Linux 6.1. See + [prod-host-setup](./docs/prod-host-setup.md) for some security and performance + considerations. - [#4045](https://github.com/firecracker-microvm/firecracker/pull/4045) and [#4075](https://github.com/firecracker-microvm/firecracker/pull/4075): Added `snapshot-editor` tool for modifications of snapshot files. diff --git a/README.md b/README.md index b8679af8402..434d733f952 100644 --- a/README.md +++ b/README.md @@ -132,9 +132,9 @@ We test all combinations of: | Instance | Host OS & Kernel | Guest Rootfs | Guest Kernel | | :--------- | :----------------- | :------------- | :------------- | -| m5d.metal | al2 linux_4.1 | ubuntu 18.04 | linux_4.14 | +| m5d.metal | al2 linux_4.1 | ubuntu 22.04 | linux_4.14 | | m6i.metal | al2 linux_5.10 | | linux_5.10 | -| m6a.metal | | | | +| m6a.metal | al2023 linux_6.1 | | | | m6g.metal | | | | | c7g.metal | | | | diff --git a/docs/prod-host-setup.md b/docs/prod-host-setup.md index e597c56c480..00b65e48b89 100644 --- a/docs/prod-host-setup.md +++ b/docs/prod-host-setup.md @@ -328,3 +328,86 @@ wget -O - https://meltdown.ovh | bash [1]: https://elixir.free-electrons.com/linux/v4.14.203/source/virt/kvm/arm/hyp/timer-sr.c#L63 [2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023323.html + +### Linux 6.1 boot time regressions + +Linux 6.1 introduced some regressions in the time it takes to boot a VM, for the +x86_64 architecture. They can be mitigated depending on the CPU and the version +of cgroups in use. + +#### Explanation + +The regression happens in the `KVM_CREATE_VM` ioctl and there are two factors +that cause the issue: + +1. In the implementation of the mitigation for the iTLB multihit vulnerability, + KVM creates a worker thread called `kvm-nx-lpage-recovery`. This thread is + responsible for recovering huge pages split when the mitigation kicks-in. In + the process of creating this thread, KVM calls `cgroup_attach_task_all()` to + move it to the same cgroup used by the hypervisor thread +1. In kernel v4.4, upstream converted a cgroup per process read-write semaphore + into a per-cpu read-write semaphore to allow to perform operations across + multiple processes + ([commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=1ed1328792ff46e4bb86a3d7f7be2971f4549f6c)). + It was found that this conversion introduced high latency for write paths, + which mainly includes moving tasks between cgroups. This was fixed in kernel + v4.9 by + [commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=3942a9bd7b5842a924e99ee6ec1350b8006c94ec) + which chose to favor writers over readers since moving tasks between cgroups + is a common operation for Android. However, In kernel 6.0, upstream decided + to revert back again and favor readers over writers re-introducing the + original behavior of the rw semaphore + ([commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=6a010a49b63ac8465851a79185d8deff966f8e1a)). + At the same time, this commit provided an option called favordynmods to favor + writers over readers. +1. Since the `kvm-nx-lpage-recovery` thread creation and its cgroup change is done + in the `KVM_CREATE_VM` call, the high latency we observe in 6.1 is due to the + upstream decision to favor readers over writers for this per-cpu rw + semaphore. While the 4.14 and 5.10 kernels favor writers over readers. + +The first step is to check if the host is vulnerable to iTLB multihit. Look at +the value of `cat /sys/devices/system/cpu/vulnerabilities/itlb_multihit`. If it +does says `Not affected`, the host is not vulnerable and you can apply +mitigation 2, and optionally 1 for best results. Otherwise it is vulnerable and +you can only apply mitigation 1. + +#### Mitigation 1: `favordynmods` + +The mitigation in this case is to enable `favordynmods` in cgroupsv1 or +cgroupsv2. This changes the behavior of all cgroups in the host, and makes it +closer to the performance of Linux 5.10 and 4.14. + +For cgroupsv2, run this command: + +```sh +sudo mount -o remount,favordynmods /sys/fs/cgroup +``` + +For cgroupsv1, remounting with `favordynmods` is not supported, so it has to be +done at boot time, through a kernel command line option[^1]. Add +`cgroup_favordynmods=true` to your kernel command line in GRUB. Refer to your +distribution's documentation for where to make this change[^2] + +[^2] Look for `GRUB_CMDLINE_LINUX` in file `/etc/default/grub` in RPM-based +systems, and [this doc for +Ubuntu](https://wiki.ubuntu.com/Kernel/KernelBootParameters). + +[^1]: this command line option is still unreleased at the moment of writing, but +will be part of 6.7 and may be backported to 6.1: + + +#### Mitigation 2: `kvm.nx_huge_pages=never` + +This mitigation is preferred to the previous one as it is less invasive (it +doesn't affect other cgroups), but it can also be combined with the cgroups +mitigation. + +```sh +KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') +sudo modprobe -r $KVM_VENDOR_MOD kvm +sudo modprobe kvm nx_huge_pages=never +sudo modprobe $KVM_VENDOR_MOD +``` + +To validate that the change took effect, the file +`/sys/module/kvm/parameters/nx_huge_pages` should say `never`. diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 96e7599e836..6042eeb1d06 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -56,11 +56,6 @@ def test_no_boottime(test_microvm_with_api): assert not timestamps -# temporarily disable this test in 6.1 -@pytest.mark.xfail( - global_props.host_linux_version == "6.1", - reason="perf regression under investigation", -) @pytest.mark.skipif( global_props.cpu_codename == "INTEL_SKYLAKE" and global_props.host_linux_version == "5.10", @@ -84,11 +79,6 @@ def test_boottime_no_network(fast_microvm, record_property, metrics): ), f"boot time {boottime_us} cannot be greater than: {MAX_BOOT_TIME_US} us" -# temporarily disable this test in 6.1 -@pytest.mark.xfail( - global_props.host_linux_version == "6.1", - reason="perf regression under investigation", -) @pytest.mark.skipif( global_props.cpu_codename == "INTEL_SKYLAKE" and global_props.host_linux_version == "5.10", diff --git a/tools/devtool b/tools/devtool index 9c724a346bf..4a3533705a6 100755 --- a/tools/devtool +++ b/tools/devtool @@ -531,6 +531,53 @@ ensure_ci_artifacts() { fi } +apply_linux_61_tweaks() { + KV=$(uname -r) + if [[ $KV != 6.1.* ]] || [ $(uname -m) != x86_64 ]; then + return + fi + say "Applying Linux 6.1 boot-time regression mitigations" + + KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') + ITLB_MULTIHIT=/sys/devices/system/cpu/vulnerabilities/itlb_multihit + NX_HUGEPAGES=/sys/module/kvm/parameters/nx_huge_pages + + # If m6a/m6i + if grep -q "Not affected" $ITLB_MULTIHIT; then + echo -e "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" + # we need a lock so another process is not running the same thing and to + # avoid race conditions. + lockfile="/tmp/.linux61_tweaks.lock" + set -C # noclobber + while true; do + if echo "$$" > "$lockfile"; then + echo "Successfully acquired lock" + if ! grep -q "never" $NX_HUGEPAGES; then + echo "Reloading KVM modules with nx_huge_pages=never" + sudo modprobe -r $KVM_VENDOR_MOD kvm + sudo modprobe kvm nx_huge_pages=never + sudo modprobe $KVM_VENDOR_MOD + fi + rm "$lockfile" + break + else + sleep 5s + fi + done + tail -v $ITLB_MULTIHIT $NX_HUGEPAGES + # else (m5d Skylake and CascadeLake) + else + echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" + mount |grep cgroup |grep -q favordynmods + if [ $? -ne 0 ]; then + say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" + else + echo "favordynmods is enabled" + fi + fi +} + + # `$0 test` - run integration tests # Please see `$0 help` for more information. # @@ -565,9 +612,12 @@ cmd_test() { ensure_build_dir ensure_ci_artifacts + apply_linux_61_tweaks + # If we got to here, we've got all we need to continue. say "Kernel version: $(uname -r)" say "$(sed '/^processor.*: 0$/,/^processor.*: 1$/!d; /^processor.*: 1$/d' /proc/cpuinfo)" + say "RPM microcode_ctl version: $(rpm -q microcode_ctl)" say "Starting test run ..." # Testing (running Firecracker via the jailer) needs root access, From df5eb532b0ecc7cd2b246e4d360f86c894693439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?= Date: Wed, 30 Aug 2023 20:55:31 +0200 Subject: [PATCH 3/3] tests: allow running outside of a git repository MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the git information for metrics optional. Useful when running outside of a classic git repository, for example extracted from a tarball or a git worktree. Signed-off-by: Pablo Barbáchano --- tests/framework/properties.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/framework/properties.py b/tests/framework/properties.py index d934e5d6296..036d3743b42 100644 --- a/tests/framework/properties.py +++ b/tests/framework/properties.py @@ -72,13 +72,19 @@ def __init__(self): self.os = get_os_version() self.host_os = get_host_os() self.libc_ver = "-".join(platform.libc_ver()) - self.git_commit_id = run_cmd("git rev-parse HEAD") - self.git_branch = run_cmd("git show -s --pretty=%D HEAD") - self.git_origin_url = run_cmd("git config --get remote.origin.url") self.rust_version = run_cmd("rustc --version |awk '{print $2}'") self.buildkite_pipeline_slug = os.environ.get("BUILDKITE_PIPELINE_SLUG") self.buildkite_build_number = os.environ.get("BUILDKITE_BUILD_NUMBER") + if self._in_git_repo(): + self.git_commit_id = run_cmd("git rev-parse HEAD") + self.git_branch = run_cmd("git show -s --pretty=%D HEAD") + self.git_origin_url = run_cmd("git config --get remote.origin.url") + else: + self.git_commit_id = None + self.git_branch = None + self.git_origin_url = None + self.environment = self._detect_environment() if self.is_ec2: self.instance = imdsv2_get("/meta-data/instance-type") @@ -105,6 +111,12 @@ def _detect_environment(self): except Exception: return "local" + def _in_git_repo(self): + try: + run_cmd("git rev-parse --show-toplevel") + except subprocess.CalledProcessError: + return False + return True + global_props = GlobalProps() -# TBD could do a props fixture for tests to use...