From e9b07f83bd4447b6dbbcd7da57a4a5ef3c5bfa3a Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 13 Sep 2023 14:47:51 +0100 Subject: [PATCH] fix: Avoid reallocating fd table during snapshot restore Linux maintains a per-process file descriptor table. By default, this table has size 64. Whenever space in this table runs out, the file descriptor table gets reallocated, with its size increased to the next power of two. Firecracker creates a lot of eventfds and timerfds for its devices. These are created in the hot path of snapshot restore. For medium to large microVMs, firecracker uses more than 64 file descriptors, meaning we reallocate the file descriptor table on the hot path. However, this reallocation uses a RCU, meaning the kernel needs to hold a lock. To acquire this lock, it needs to wait for all other accesses to the file descriptors to cease, which introduces a severe latency to snapshot-restore times (between 30ms and 70ms). This commit avoids that latency by ensuring the file descriptor table is large enough to hold the jailer-defined limit of file descriptors at process start already. This avoids reallocating the file descriptor table at runtime (and the memory overhead is negligable, as each entry in the fdtable is simply a pointer). Signed-off-by: Patrick Roy --- CHANGELOG.md | 5 +++ src/firecracker/src/main.rs | 73 +++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c48d33649f7..538a7c4c82f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,11 @@ apply the mitigation against MMIO stale data vulnerability when it is running on a processor that does not enumerate FBSDP_NO, PSDP_NO and SBDR_SSDP_NO on IA32_ARCH_CAPABILITIES MSR. +- Made Firecracker resize its file descriptor table on process start. It now + preallocates the in-kernel fdtable to hold `RLIMIT_NOFILE` many fds (or 2048 + if no limit is set). This avoids the kernel reallocating the fdtable during + Firecracker operations, resulting in a 30ms to 70ms reduction of snapshot + restore times for medium to large microVMs with many devices attached. ### Fixed diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 1a1d3f836d6..31f05517fbb 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -55,12 +55,24 @@ enum MainError { MetricsInitialization(MetricsConfigError), #[error("Seccomp error: {0}")] SeccompFilter(FilterError), + #[error("Failed to resize fd table: {0}")] + ResizeFdtable(ResizeFdTableError), #[error("RunWithApiError error: {0}")] RunWithApi(ApiServerError), #[error("RunWithoutApiError error: {0}")] RunWithoutApiError(RunWithoutApiError), } +#[derive(Debug, thiserror::Error)] +enum ResizeFdTableError { + #[error("Failed to get RLIMIT_NOFILE")] + GetRlimit, + #[error("Failed to call dup2 to resize fdtable")] + Dup2(io::Error), + #[error("Failed to close dup2'd file descriptor")] + Close(io::Error), +} + impl From for ExitCode { fn from(value: MainError) -> Self { let exit_code = match value { @@ -96,6 +108,19 @@ fn main_exec() -> Result<(), MainError> { #[cfg(target_arch = "aarch64")] enable_ssbd_mitigation(); + if let Err(err) = resize_fdtable() { + match err { + // These errors are non-critical: In the worst case we have worse snapshot restore + // performance. + ResizeFdTableError::GetRlimit | ResizeFdTableError::Dup2(_) => { + logger::debug!("Failed to resize fdtable: {err}") + } + // This error means that we now have a random file descriptor lying around, abort to be + // cautious. + ResizeFdTableError::Close(_) => return Err(MainError::ResizeFdtable(err)), + } + } + // We need this so that we can reset terminal to canonical mode if panic occurs. let stdin = io::stdin(); @@ -397,6 +422,54 @@ fn main_exec() -> Result<(), MainError> { } } +/// Attempts to resize the processes file descriptor table to match RLIMIT_NOFILE or 2048 if no +/// RLIMIT_NOFILE is set (this can only happen if firecracker is run outside the jailer. 2048 is +/// the default the jailer would set). +/// +/// We do this resizing because the kernel default is 64, with a reallocation happening whenever +/// the tabel fills up. This was happening for some larger microVMs, and reallocating the +/// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered +/// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path. +fn resize_fdtable() -> Result<(), ResizeFdTableError> { + let mut rlimit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + + // SAFETY: We pass a pointer to a valid area of memory to which we have exclusive mutable access + if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlimit as *mut libc::rlimit) } < 0 { + return Err(ResizeFdTableError::GetRlimit); + } + + // If no jailer is used, there might not be an NOFILE limit set. In this case, resize + // the table to the default that the jailer would usually impose (2048) + let limit: libc::c_int = if rlimit.rlim_cur == libc::RLIM_INFINITY { + 2048 + } else { + rlimit.rlim_cur.try_into().unwrap_or(2048) + }; + + // Resize the file descriptor table to its maximal possible size, to ensure that + // firecracker will not need to reallocate it later. If the file descriptor table + // needs to be reallocated (which by default happens once more than 64 fds exist, + // something that happens for reasonably complex microvms due to each device using + // a multitude of eventfds), this can incur a significant performance impact (it + // was responsible for a 30ms-70ms impact on snapshot restore times). + if limit > 3 { + // SAFETY: Duplicating stdin is safe + if unsafe { libc::dup2(0, limit - 1) } < 0 { + return Err(ResizeFdTableError::Dup2(io::Error::last_os_error())); + } + + // SAFETY: Closing the just created duplicate is safe + if unsafe { libc::close(limit - 1) } < 0 { + return Err(ResizeFdTableError::Close(io::Error::last_os_error())); + } + } + + Ok(()) +} + /// Enable SSBD mitigation through `prctl`. #[cfg(target_arch = "aarch64")] pub fn enable_ssbd_mitigation() {