From 9f068770d9e421ef8bb8428dce5a5a820dbd1f68 Mon Sep 17 00:00:00 2001 From: Muir Manders Date: Fri, 22 Nov 2024 16:40:59 -0800 Subject: [PATCH] status: add ProjFS race mitigation Summary: There is an inherent race condition on ProjectedFS (Windows EdenFS) where, after an fs write, "status" doesn't reflect the write because EdenFS hasn't received the async notification from ProjFS. This can wreak all sorts of havoc since "status" is used in Sapling for almost everything. Mitigate the race condition on the Sapling side by writing a touch file before we call status, and then waiting until the status result contains the touch file. The idea is that once we see the touch file in the status result from eden, we can be confident that previous filesystem writes are also reflected. The "derace" mitigation is enabled with the Sapling config experimental.derace-eden-status-mode=info|fatal (only for Windows). "info" will not propagate an error if we fail to see the touch file, where "fatal" will propagate an error. You can use experimental.derace-eden-status-timeout to control how long Sapling waits for the touch file to show up _after_ the first status. Defaults to 3s (picked unscientifically by me). Reviewed By: MichaelCuevas Differential Revision: D66391999 fbshipit-source-id: 2a88d3feeef68f9d8680715eccc171c0c5fb1176 --- .../lib/workingcopy/src/filesystem/edenfs.rs | 148 +++++++++++++++++- eden/scm/lib/workingcopy/src/workingcopy.rs | 1 + 2 files changed, 148 insertions(+), 1 deletion(-) diff --git a/eden/scm/lib/workingcopy/src/filesystem/edenfs.rs b/eden/scm/lib/workingcopy/src/filesystem/edenfs.rs index d53b4482a4f00..7581a4d0a9b0a 100644 --- a/eden/scm/lib/workingcopy/src/filesystem/edenfs.rs +++ b/eden/scm/lib/workingcopy/src/filesystem/edenfs.rs @@ -6,11 +6,15 @@ */ use std::cell::Cell; +use std::collections::BTreeMap; use std::path::Path; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; +use anyhow::anyhow; +use anyhow::bail; use anyhow::Context; use anyhow::Result; use configmodel::Config; @@ -25,6 +29,8 @@ use storemodel::FileStore; use treestate::treestate::TreeState; use types::hgid::NULL_ID; use types::HgId; +use types::RepoPath; +use types::RepoPathBuf; use vfs::VFS; use crate::client::WorkingCopyClient; @@ -32,6 +38,12 @@ use crate::filesystem::FileSystem; use crate::filesystem::PendingChange; use crate::util::added_files; +enum DeraceMode { + Off, + // timeout, fatal? + On(Duration, bool), +} + pub struct EdenFileSystem { treestate: Arc>, client: Arc, @@ -40,10 +52,13 @@ pub struct EdenFileSystem { // For wait_for_potential_change journal_position: Cell<(i64, i64)>, + + derace_mode: DeraceMode, } impl EdenFileSystem { pub fn new( + config: &dyn Config, client: Arc, vfs: VFS, dot_dir: &Path, @@ -52,14 +67,145 @@ impl EdenFileSystem { let journal_position = Cell::new(client.get_journal_position()?); let treestate = create_treestate(dot_dir, vfs.case_sensitive())?; let treestate = Arc::new(Mutex::new(treestate)); + + let derace_mode = if cfg!(windows) { + let timeout: Duration = config + .get_or("experimental", "derace-eden-status-timeout", || { + Duration::from_secs(3) + }) + .map_err(|err| anyhow!(err))?; + match config + .get("experimental", "derace-eden-status-mode") + .as_deref() + { + Some("info") => DeraceMode::On(timeout, false), + Some("fatal") => DeraceMode::On(timeout, true), + _ => DeraceMode::Off, + } + } else { + DeraceMode::Off + }; + Ok(EdenFileSystem { treestate, client, vfs, store, journal_position, + derace_mode, }) } + + fn get_status( + &self, + p1: HgId, + include_ignored: bool, + ) -> Result> { + // If "derace" mode is enabled, below we touch a file and then keep calling eden + // status until we see the (untracked) file in the results. This is to mitigate a + // ProjectedFS race condition on Windows where eden's reported status won't + // reflect recent fs writes until eden receives the corresponding (async) + // notification from ProjectedFS. + + // Use a static touch file path to avoid creating too many files in the overlay. + // Note that this means we are open to race conditions between two concurrent + // "status" calls sharing the same touch file, but that should be rare (and the + // worst case is we just don't perform the derace check). + let derace_touch_file: &RepoPath = + ".eden-status-derace-GSZULQFGEEJXIONP".try_into().unwrap(); + + // This is set iff we create the touch file. + let mut wait_for_touch_file: Option = None; + let mut propagate_derace_error = false; + + if let DeraceMode::On(timeout, fatal) = self.derace_mode { + // Note: this touch file approach will be ineffective if the touch file + // already exists. The assumption is that will be very rare. We clean up the + // touch file aggressively below. + if let Err(err) = self + .vfs + .write(derace_touch_file, b"", vfs::UpdateFlag::Regular) + { + tracing::trace!(target: "eden_derace_info", eden_derace_error="error creating"); + tracing::error!(?err, %derace_touch_file, "error writing derace touch file"); + } else { + tracing::trace!("wrote derace touch file"); + wait_for_touch_file = Some(timeout); + propagate_derace_error = fatal; + } + } + + let mut start_time: Option = None; + loop { + let mut status_map = self.client.get_status(p1, include_ignored)?; + + // Handle derace touch file regardless of whether we created it. We want to + // ignore it and clean it up if it leaked previously. + if status_map.remove(derace_touch_file).is_some() { + if let Err(err) = self.vfs.remove(derace_touch_file) { + tracing::trace!(target: "eden_derace_info", eden_derace_error="error removing"); + tracing::error!(?err, %derace_touch_file, "error removing derace touch file"); + } + + if wait_for_touch_file.is_some() { + // If we are in derace mode, log how long we waited. + match start_time { + Some(start) => { + // We had multiple loops - log additional time we waited past first "status". + tracing::trace!(elapsed=?start.elapsed(), "eventually found derace touch file"); + tracing::trace!(target: "eden_derace_info", eden_derace_elapsed=start.elapsed().as_millis()); + } + None => { + // We saw touch file on first status - log "0". + tracing::trace!("found derace touch file on first try"); + tracing::trace!(target: "eden_derace_info", eden_derace_elapsed=0); + } + } + } + + return Ok(status_map); + } + + let timeout = match wait_for_touch_file { + Some(timeout) => timeout, + // We didn't create a touch file - nothing to check or wait for. + None => return Ok(status_map), + }; + + // + // From here we know we are in derace mode, and the first status call did not contain the touch file. + // + + if !self.vfs.exists(derace_touch_file).unwrap_or(false) { + tracing::warn!("derace touch file unexpectedly missing"); + tracing::trace!(target: "eden_derace_info", eden_derace_error="file missing"); + + // Touch file has disappeared from disk - probably raced with another + // "status" call that cleaned up the touch file. Should be pretty rare, so + // let's just give up and say "ok". + return Ok(status_map); + } + + match start_time { + // Start the derace clock _after_ the first status attempt (i.e. it measures additional time). + None => start_time = Some(Instant::now()), + Some(start) => { + if start.elapsed() >= timeout { + tracing::trace!(target: "eden_derace_info", eden_derace_error="timeout"); + + if propagate_derace_error { + bail!("failed to derace EdenFS status after {:?}", start.elapsed()); + } else { + return Ok(status_map); + } + } + } + } + + // Wait a bit for touch file PJFS notification to get to eden. + std::thread::sleep(Duration::from_millis(5)); + } + } } fn create_treestate(dot_dir: &std::path::Path, case_sensitive: bool) -> Result { @@ -85,7 +231,7 @@ impl FileSystem for EdenFileSystem { .next() .unwrap_or_else(|| Ok(NULL_ID))?; - let status_map = self.client.get_status(p1, include_ignored)?; + let status_map = self.get_status(p1, include_ignored)?; // In rare cases, a file can transition in the dirstate directly from "normal" to // "added". Eden won't report a pending change if the file is not modified (since diff --git a/eden/scm/lib/workingcopy/src/workingcopy.rs b/eden/scm/lib/workingcopy/src/workingcopy.rs index 9559e65250d07..a79da9307b361 100644 --- a/eden/scm/lib/workingcopy/src/workingcopy.rs +++ b/eden/scm/lib/workingcopy/src/workingcopy.rs @@ -312,6 +312,7 @@ impl WorkingCopy { { let client = Arc::new(EdenFsClient::from_wdir(vfs.root())?); Box::new(EdenFileSystem::new( + config, client, vfs.clone(), dot_dir,