From ecd36d11c260623bedf061d04a47fd1b6c7a53de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?= Date: Wed, 21 Jun 2023 10:41:34 +0200 Subject: [PATCH] test: apply mitigations in Linux 6.1 to make boot times fast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux 6.1 brought a guest boottime performance regression. This commits documents the issue in ./docs/prod-host-setup.md and uses the mitigations to enforce our boot time SLA. Signed-off-by: Pablo Barbáchano Co-authored-by: Luiz Capitulino --- CHANGELOG.md | 4 + README.md | 4 +- docs/prod-host-setup.md | 83 +++++++++++++++++++ .../performance/test_boottime.py | 10 --- tools/devtool | 50 +++++++++++ 5 files changed, 139 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 778e5f13902..aadf78b45c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ ### Added +- [#3837](https://github.com/firecracker-microvm/firecracker/issues/3837): Added + official support for Linux 6.1. See + [prod-host-setup](./docs/prod-host-setup.md) for some security and performance + considerations. - [#4045](https://github.com/firecracker-microvm/firecracker/pull/4045) and [#4075](https://github.com/firecracker-microvm/firecracker/pull/4075): Added `snapshot-editor` tool for modifications of snapshot files. diff --git a/README.md b/README.md index b8679af8402..434d733f952 100644 --- a/README.md +++ b/README.md @@ -132,9 +132,9 @@ We test all combinations of: | Instance | Host OS & Kernel | Guest Rootfs | Guest Kernel | | :--------- | :----------------- | :------------- | :------------- | -| m5d.metal | al2 linux_4.1 | ubuntu 18.04 | linux_4.14 | +| m5d.metal | al2 linux_4.1 | ubuntu 22.04 | linux_4.14 | | m6i.metal | al2 linux_5.10 | | linux_5.10 | -| m6a.metal | | | | +| m6a.metal | al2023 linux_6.1 | | | | m6g.metal | | | | | c7g.metal | | | | diff --git a/docs/prod-host-setup.md b/docs/prod-host-setup.md index e597c56c480..00b65e48b89 100644 --- a/docs/prod-host-setup.md +++ b/docs/prod-host-setup.md @@ -328,3 +328,86 @@ wget -O - https://meltdown.ovh | bash [1]: https://elixir.free-electrons.com/linux/v4.14.203/source/virt/kvm/arm/hyp/timer-sr.c#L63 [2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023323.html + +### Linux 6.1 boot time regressions + +Linux 6.1 introduced some regressions in the time it takes to boot a VM, for the +x86_64 architecture. They can be mitigated depending on the CPU and the version +of cgroups in use. + +#### Explanation + +The regression happens in the `KVM_CREATE_VM` ioctl and there are two factors +that cause the issue: + +1. In the implementation of the mitigation for the iTLB multihit vulnerability, + KVM creates a worker thread called `kvm-nx-lpage-recovery`. This thread is + responsible for recovering huge pages split when the mitigation kicks-in. In + the process of creating this thread, KVM calls `cgroup_attach_task_all()` to + move it to the same cgroup used by the hypervisor thread +1. In kernel v4.4, upstream converted a cgroup per process read-write semaphore + into a per-cpu read-write semaphore to allow to perform operations across + multiple processes + ([commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=1ed1328792ff46e4bb86a3d7f7be2971f4549f6c)). + It was found that this conversion introduced high latency for write paths, + which mainly includes moving tasks between cgroups. This was fixed in kernel + v4.9 by + [commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=3942a9bd7b5842a924e99ee6ec1350b8006c94ec) + which chose to favor writers over readers since moving tasks between cgroups + is a common operation for Android. However, In kernel 6.0, upstream decided + to revert back again and favor readers over writers re-introducing the + original behavior of the rw semaphore + ([commit](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?&id=6a010a49b63ac8465851a79185d8deff966f8e1a)). + At the same time, this commit provided an option called favordynmods to favor + writers over readers. +1. Since the `kvm-nx-lpage-recovery` thread creation and its cgroup change is done + in the `KVM_CREATE_VM` call, the high latency we observe in 6.1 is due to the + upstream decision to favor readers over writers for this per-cpu rw + semaphore. While the 4.14 and 5.10 kernels favor writers over readers. + +The first step is to check if the host is vulnerable to iTLB multihit. Look at +the value of `cat /sys/devices/system/cpu/vulnerabilities/itlb_multihit`. If it +does says `Not affected`, the host is not vulnerable and you can apply +mitigation 2, and optionally 1 for best results. Otherwise it is vulnerable and +you can only apply mitigation 1. + +#### Mitigation 1: `favordynmods` + +The mitigation in this case is to enable `favordynmods` in cgroupsv1 or +cgroupsv2. This changes the behavior of all cgroups in the host, and makes it +closer to the performance of Linux 5.10 and 4.14. + +For cgroupsv2, run this command: + +```sh +sudo mount -o remount,favordynmods /sys/fs/cgroup +``` + +For cgroupsv1, remounting with `favordynmods` is not supported, so it has to be +done at boot time, through a kernel command line option[^1]. Add +`cgroup_favordynmods=true` to your kernel command line in GRUB. Refer to your +distribution's documentation for where to make this change[^2] + +[^2] Look for `GRUB_CMDLINE_LINUX` in file `/etc/default/grub` in RPM-based +systems, and [this doc for +Ubuntu](https://wiki.ubuntu.com/Kernel/KernelBootParameters). + +[^1]: this command line option is still unreleased at the moment of writing, but +will be part of 6.7 and may be backported to 6.1: + + +#### Mitigation 2: `kvm.nx_huge_pages=never` + +This mitigation is preferred to the previous one as it is less invasive (it +doesn't affect other cgroups), but it can also be combined with the cgroups +mitigation. + +```sh +KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') +sudo modprobe -r $KVM_VENDOR_MOD kvm +sudo modprobe kvm nx_huge_pages=never +sudo modprobe $KVM_VENDOR_MOD +``` + +To validate that the change took effect, the file +`/sys/module/kvm/parameters/nx_huge_pages` should say `never`. diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 96e7599e836..6042eeb1d06 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -56,11 +56,6 @@ def test_no_boottime(test_microvm_with_api): assert not timestamps -# temporarily disable this test in 6.1 -@pytest.mark.xfail( - global_props.host_linux_version == "6.1", - reason="perf regression under investigation", -) @pytest.mark.skipif( global_props.cpu_codename == "INTEL_SKYLAKE" and global_props.host_linux_version == "5.10", @@ -84,11 +79,6 @@ def test_boottime_no_network(fast_microvm, record_property, metrics): ), f"boot time {boottime_us} cannot be greater than: {MAX_BOOT_TIME_US} us" -# temporarily disable this test in 6.1 -@pytest.mark.xfail( - global_props.host_linux_version == "6.1", - reason="perf regression under investigation", -) @pytest.mark.skipif( global_props.cpu_codename == "INTEL_SKYLAKE" and global_props.host_linux_version == "5.10", diff --git a/tools/devtool b/tools/devtool index 9c724a346bf..4a3533705a6 100755 --- a/tools/devtool +++ b/tools/devtool @@ -531,6 +531,53 @@ ensure_ci_artifacts() { fi } +apply_linux_61_tweaks() { + KV=$(uname -r) + if [[ $KV != 6.1.* ]] || [ $(uname -m) != x86_64 ]; then + return + fi + say "Applying Linux 6.1 boot-time regression mitigations" + + KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') + ITLB_MULTIHIT=/sys/devices/system/cpu/vulnerabilities/itlb_multihit + NX_HUGEPAGES=/sys/module/kvm/parameters/nx_huge_pages + + # If m6a/m6i + if grep -q "Not affected" $ITLB_MULTIHIT; then + echo -e "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" + # we need a lock so another process is not running the same thing and to + # avoid race conditions. + lockfile="/tmp/.linux61_tweaks.lock" + set -C # noclobber + while true; do + if echo "$$" > "$lockfile"; then + echo "Successfully acquired lock" + if ! grep -q "never" $NX_HUGEPAGES; then + echo "Reloading KVM modules with nx_huge_pages=never" + sudo modprobe -r $KVM_VENDOR_MOD kvm + sudo modprobe kvm nx_huge_pages=never + sudo modprobe $KVM_VENDOR_MOD + fi + rm "$lockfile" + break + else + sleep 5s + fi + done + tail -v $ITLB_MULTIHIT $NX_HUGEPAGES + # else (m5d Skylake and CascadeLake) + else + echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" + mount |grep cgroup |grep -q favordynmods + if [ $? -ne 0 ]; then + say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" + else + echo "favordynmods is enabled" + fi + fi +} + + # `$0 test` - run integration tests # Please see `$0 help` for more information. # @@ -565,9 +612,12 @@ cmd_test() { ensure_build_dir ensure_ci_artifacts + apply_linux_61_tweaks + # If we got to here, we've got all we need to continue. say "Kernel version: $(uname -r)" say "$(sed '/^processor.*: 0$/,/^processor.*: 1$/!d; /^processor.*: 1$/d' /proc/cpuinfo)" + say "RPM microcode_ctl version: $(rpm -q microcode_ctl)" say "Starting test run ..." # Testing (running Firecracker via the jailer) needs root access,