From b61953e11bd6431ac6c2d0f1e4649dd9e8e56015 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 13 Sep 2023 14:47:51 +0100 Subject: [PATCH] fix: Avoid reallocating fd table during snapshot restore Linux maintains a per-process file descriptor table. By default, this table has size 64. Whenever space in this table runs out, the file descriptor table gets reallocated, with its size increased to the next power of two. Firecracker creates a lot of eventfds and timerfds for its devices. These are created in the hot path of snapshot restore. For medium to large microVMs, firecracker uses more than 64 file descriptors, meaning we reallocate the file descriptor table on the hot path. However, this reallocation uses a RCU, meaning the kernel needs to hold a lock. To acquire this lock, it needs to wait for all other accesses to the file descriptors to cease, which introduces a severe latency to snapshot-restore times (between 30ms and 70ms). This commit avoids that latency by ensuring the file descriptor table is large enough to hold the jailer-defined limit of file descriptors at process start already. This avoids reallocating the file descriptor table at runtime (and the memory overhead is negligable, as each entry in the fdtable is simply a pointer). Signed-off-by: Patrick Roy --- src/jailer/src/main.rs | 2 ++ src/jailer/src/resource_limits.rs | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/jailer/src/main.rs b/src/jailer/src/main.rs index 1f618b081707..bb206335dbce 100644 --- a/src/jailer/src/main.rs +++ b/src/jailer/src/main.rs @@ -56,6 +56,8 @@ pub enum JailerError { CloseNetNsFd(io::Error), #[error("Failed to close /dev/null fd: {0}")] CloseDevNullFd(io::Error), + #[error("Failed to close dup2'd stdin: {0}")] + CloseDup2(io::Error), #[error("Failed to call close range syscall: {0}")] CloseRange(io::Error), #[error("{}", format!("Failed to copy {:?} to {:?}: {}", .0, .1, .2).replace('\"', ""))] diff --git a/src/jailer/src/resource_limits.rs b/src/jailer/src/resource_limits.rs index 4cd2dc97e79f..2310debfcaab 100644 --- a/src/jailer/src/resource_limits.rs +++ b/src/jailer/src/resource_limits.rs @@ -98,6 +98,24 @@ impl ResourceLimits { // Set limit on number of file descriptors. ResourceLimits::set_limit(Resource::RlimitNoFile, self.no_file)?; + // Resize the file descriptor table to its maximal possible size, to ensure that + // firecracker will not need to reallocate it later. If the file descriptor table + // needs to be reallocated (which by default happens once more than 64 fds exist, + // something that happens for reasonably complex microvms due to each device using + // a multitude of eventfds), this can incur a significant performance impact (it + // was responsible for a 30ms-70ms impact on snapshot restore times). + if self.no_file >= 64 { + // SAFETY: Duplicating stdin is safe + if unsafe { libc::dup2(0, self.no_file as libc::c_int - 1) } < 0 { + return Err(JailerError::Dup2(std::io::Error::last_os_error())); + } + + // SAFETY: Closing the just created duplicate is safe + if unsafe { libc::close(self.no_file as libc::c_int - 1) } < 0 { + return Err(JailerError::CloseDup2(std::io::Error::last_os_error())); + } + } + Ok(()) }