Skip to content

Commit

Permalink
status: add ProjFS race mitigation
Browse files Browse the repository at this point in the history
Summary:
There is an inherent race condition on ProjectedFS (Windows EdenFS) where, after an fs write, "status" doesn't reflect the write because EdenFS hasn't received the async notification from ProjFS. This can wreak all sorts of havoc since "status" is used in Sapling for almost everything.

Mitigate the race condition on the Sapling side by writing a touch file before we call status, and then waiting until the status result contains the touch file. The idea is that once we see the touch file in the status result from eden, we can be confident that previous filesystem writes are also reflected.

The "derace" mitigation is enabled with the Sapling config experimental.derace-eden-status-mode=info|fatal (only for Windows). "info" will not propagate an error if we fail to see the touch file, where "fatal" will propagate an error.

You can use experimental.derace-eden-status-timeout to control how long Sapling waits for the touch file to show up _after_ the first status. Defaults to 3s (picked unscientifically by me).

Reviewed By: MichaelCuevas

Differential Revision: D66391999

fbshipit-source-id: 2a88d3feeef68f9d8680715eccc171c0c5fb1176
  • Loading branch information
muirdm authored and facebook-github-bot committed Nov 23, 2024
1 parent bd30f00 commit 9f06877
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 1 deletion.
148 changes: 147 additions & 1 deletion eden/scm/lib/workingcopy/src/filesystem/edenfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
*/

use std::cell::Cell;
use std::collections::BTreeMap;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use configmodel::Config;
Expand All @@ -25,13 +29,21 @@ use storemodel::FileStore;
use treestate::treestate::TreeState;
use types::hgid::NULL_ID;
use types::HgId;
use types::RepoPath;
use types::RepoPathBuf;
use vfs::VFS;

use crate::client::WorkingCopyClient;
use crate::filesystem::FileSystem;
use crate::filesystem::PendingChange;
use crate::util::added_files;

enum DeraceMode {
Off,
// timeout, fatal?
On(Duration, bool),
}

pub struct EdenFileSystem {
treestate: Arc<Mutex<TreeState>>,
client: Arc<EdenFsClient>,
Expand All @@ -40,10 +52,13 @@ pub struct EdenFileSystem {

// For wait_for_potential_change
journal_position: Cell<(i64, i64)>,

derace_mode: DeraceMode,
}

impl EdenFileSystem {
pub fn new(
config: &dyn Config,
client: Arc<EdenFsClient>,
vfs: VFS,
dot_dir: &Path,
Expand All @@ -52,14 +67,145 @@ impl EdenFileSystem {
let journal_position = Cell::new(client.get_journal_position()?);
let treestate = create_treestate(dot_dir, vfs.case_sensitive())?;
let treestate = Arc::new(Mutex::new(treestate));

let derace_mode = if cfg!(windows) {
let timeout: Duration = config
.get_or("experimental", "derace-eden-status-timeout", || {
Duration::from_secs(3)
})
.map_err(|err| anyhow!(err))?;
match config
.get("experimental", "derace-eden-status-mode")
.as_deref()
{
Some("info") => DeraceMode::On(timeout, false),
Some("fatal") => DeraceMode::On(timeout, true),
_ => DeraceMode::Off,
}
} else {
DeraceMode::Off
};

Ok(EdenFileSystem {
treestate,
client,
vfs,
store,
journal_position,
derace_mode,
})
}

fn get_status(
&self,
p1: HgId,
include_ignored: bool,
) -> Result<BTreeMap<RepoPathBuf, FileStatus>> {
// If "derace" mode is enabled, below we touch a file and then keep calling eden
// status until we see the (untracked) file in the results. This is to mitigate a
// ProjectedFS race condition on Windows where eden's reported status won't
// reflect recent fs writes until eden receives the corresponding (async)
// notification from ProjectedFS.

// Use a static touch file path to avoid creating too many files in the overlay.
// Note that this means we are open to race conditions between two concurrent
// "status" calls sharing the same touch file, but that should be rare (and the
// worst case is we just don't perform the derace check).
let derace_touch_file: &RepoPath =
".eden-status-derace-GSZULQFGEEJXIONP".try_into().unwrap();

// This is set iff we create the touch file.
let mut wait_for_touch_file: Option<Duration> = None;
let mut propagate_derace_error = false;

if let DeraceMode::On(timeout, fatal) = self.derace_mode {
// Note: this touch file approach will be ineffective if the touch file
// already exists. The assumption is that will be very rare. We clean up the
// touch file aggressively below.
if let Err(err) = self
.vfs
.write(derace_touch_file, b"", vfs::UpdateFlag::Regular)
{
tracing::trace!(target: "eden_derace_info", eden_derace_error="error creating");
tracing::error!(?err, %derace_touch_file, "error writing derace touch file");
} else {
tracing::trace!("wrote derace touch file");
wait_for_touch_file = Some(timeout);
propagate_derace_error = fatal;
}
}

let mut start_time: Option<Instant> = None;
loop {
let mut status_map = self.client.get_status(p1, include_ignored)?;

// Handle derace touch file regardless of whether we created it. We want to
// ignore it and clean it up if it leaked previously.
if status_map.remove(derace_touch_file).is_some() {
if let Err(err) = self.vfs.remove(derace_touch_file) {
tracing::trace!(target: "eden_derace_info", eden_derace_error="error removing");
tracing::error!(?err, %derace_touch_file, "error removing derace touch file");
}

if wait_for_touch_file.is_some() {
// If we are in derace mode, log how long we waited.
match start_time {
Some(start) => {
// We had multiple loops - log additional time we waited past first "status".
tracing::trace!(elapsed=?start.elapsed(), "eventually found derace touch file");
tracing::trace!(target: "eden_derace_info", eden_derace_elapsed=start.elapsed().as_millis());
}
None => {
// We saw touch file on first status - log "0".
tracing::trace!("found derace touch file on first try");
tracing::trace!(target: "eden_derace_info", eden_derace_elapsed=0);
}
}
}

return Ok(status_map);
}

let timeout = match wait_for_touch_file {
Some(timeout) => timeout,
// We didn't create a touch file - nothing to check or wait for.
None => return Ok(status_map),
};

//
// From here we know we are in derace mode, and the first status call did not contain the touch file.
//

if !self.vfs.exists(derace_touch_file).unwrap_or(false) {
tracing::warn!("derace touch file unexpectedly missing");
tracing::trace!(target: "eden_derace_info", eden_derace_error="file missing");

// Touch file has disappeared from disk - probably raced with another
// "status" call that cleaned up the touch file. Should be pretty rare, so
// let's just give up and say "ok".
return Ok(status_map);
}

match start_time {
// Start the derace clock _after_ the first status attempt (i.e. it measures additional time).
None => start_time = Some(Instant::now()),
Some(start) => {
if start.elapsed() >= timeout {
tracing::trace!(target: "eden_derace_info", eden_derace_error="timeout");

if propagate_derace_error {
bail!("failed to derace EdenFS status after {:?}", start.elapsed());
} else {
return Ok(status_map);
}
}
}
}

// Wait a bit for touch file PJFS notification to get to eden.
std::thread::sleep(Duration::from_millis(5));
}
}
}

fn create_treestate(dot_dir: &std::path::Path, case_sensitive: bool) -> Result<TreeState> {
Expand All @@ -85,7 +231,7 @@ impl FileSystem for EdenFileSystem {
.next()
.unwrap_or_else(|| Ok(NULL_ID))?;

let status_map = self.client.get_status(p1, include_ignored)?;
let status_map = self.get_status(p1, include_ignored)?;

// In rare cases, a file can transition in the dirstate directly from "normal" to
// "added". Eden won't report a pending change if the file is not modified (since
Expand Down
1 change: 1 addition & 0 deletions eden/scm/lib/workingcopy/src/workingcopy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ impl WorkingCopy {
{
let client = Arc::new(EdenFsClient::from_wdir(vfs.root())?);
Box::new(EdenFileSystem::new(
config,
client,
vfs.clone(),
dot_dir,
Expand Down

0 comments on commit 9f06877

Please sign in to comment.