From 74b039cd9dc84dde1d9172f2d31a6562e87d3962 Mon Sep 17 00:00:00 2001 From: Sean Olson Date: Wed, 1 Nov 2023 17:25:40 -0700 Subject: [PATCH] Decompose `walk` module into sub-modules. --- src/lib.rs | 8 +- src/walk/filter.rs | 18 +- src/walk/glob.rs | 413 ++++++++++++++++ src/walk/mod.rs | 1167 +------------------------------------------- src/walk/tree.rs | 677 +++++++++++++++++++++++++ tests/walk.rs | 3 +- 6 files changed, 1108 insertions(+), 1178 deletions(-) create mode 100644 src/walk/glob.rs create mode 100644 src/walk/tree.rs diff --git a/src/lib.rs b/src/lib.rs index 2e74d5d..658d1a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,7 +35,7 @@ mod diagnostics; mod encode; mod rule; mod token; -mod walk; +pub mod walk; #[cfg(feature = "miette")] use miette::Diagnostic; @@ -53,13 +53,11 @@ use thiserror::Error; use crate::encode::CompileError; use crate::rule::{Checked, RuleError}; use crate::token::{InvariantText, ParseError, Token, TokenTree, Tokenized}; +#[cfg(feature = "walk")] +use crate::walk::WalkError; pub use crate::capture::MatchedText; pub use crate::diagnostics::{LocatedError, Span}; -#[cfg(feature = "walk")] -pub use crate::walk::{ - FileIterator, FilterTree, GlobEntry, LinkBehavior, WalkBehavior, WalkError, WalkNegation, -}; #[cfg(windows)] const PATHS_ARE_CASE_INSENSITIVE: bool = true; diff --git a/src/walk/filter.rs b/src/walk/filter.rs index b83aab6..7899562 100644 --- a/src/walk/filter.rs +++ b/src/walk/filter.rs @@ -35,19 +35,13 @@ mod state { impl TypeState> { pub fn transpose(self) -> Option> { - match self.inner { - Some(inner) => Some(TypeState::new(inner)), - _ => None, - } + self.inner.map(TypeState::new) } } impl TypeState> { pub fn transpose(self) -> Result, TypeState> { - match self.inner { - Ok(inner) => Ok(TypeState::new(inner)), - Err(error) => Err(TypeState::new(error)), - } + self.inner.map(TypeState::new).map_err(TypeState::new) } } @@ -268,9 +262,8 @@ where impl Separation<(Option, R)> { pub fn transpose_filtrate(self) -> 
Option> { match self { - Separation::Filtrate(filtrate) => match filtrate.into_inner() { - Some(filtrate) => Some(Filtrate::new(filtrate).into()), - _ => None, + Separation::Filtrate(filtrate) => { + filtrate.into_inner().map(Filtrate::new).map(From::from) }, Separation::Residue(residue) => Some(residue.into()), } @@ -368,8 +361,7 @@ pub enum TreeResidue { impl AsRef for TreeResidue { fn as_ref(&self) -> &T { match self { - TreeResidue::Node(ref residue) => residue, - TreeResidue::Tree(ref residue) => residue, + TreeResidue::Node(ref residue) | TreeResidue::Tree(ref residue) => residue, } } } diff --git a/src/walk/glob.rs b/src/walk/glob.rs new file mode 100644 index 0000000..a29d380 --- /dev/null +++ b/src/walk/glob.rs @@ -0,0 +1,413 @@ +use itertools::Itertools; +use regex::Regex; +use std::fs::{FileType, Metadata}; +use std::path::{Component, Path, PathBuf}; +use walkdir::{self, WalkDir}; + +use crate::capture::MatchedText; +use crate::encode::CompileError; +use crate::token::{self, Token, TokenTree}; +use crate::walk::filter::{ + self, SeparatingFilter, Separation, SkipTree, TreeIterator, TreeResidue, +}; +use crate::walk::tree::{FileIterator, LinkBehavior, WalkBehavior, WalkEntry, WalkError, WalkTree}; +use crate::{BuildError, CandidatePath, Combine, Glob}; + +impl<'t> Glob<'t> { + #[cfg_attr(docsrs, doc(cfg(feature = "walk")))] + pub fn walk( + &self, + directory: impl Into, + ) -> WalkGlob> { + self.walk_with_behavior(directory, WalkBehavior::default()) + } + + #[cfg_attr(docsrs, doc(cfg(feature = "walk")))] + pub fn walk_with_behavior( + &self, + directory: impl Into, + behavior: impl Into, + ) -> WalkGlob> { + let Anchor { root, prefix } = self.anchor(directory); + let component_patterns = WalkGlob::<()>::compile(self.tree.as_ref().tokens()) + .expect("failed to compile glob sub-expressions"); + let complete_pattern = self.pattern.clone(); + let input = WalkTree::from({ + let WalkBehavior { depth, link } = behavior.into(); + WalkDir::new(root.clone()) + 
.follow_links(match link { + LinkBehavior::ReadFile => false, + LinkBehavior::ReadTarget => true, + }) + .max_depth(depth) + }) + .filter_map_tree(move |cancellation, separation| { + use itertools::EitherOrBoth::{Both, Left, Right}; + use itertools::Position::{First, Last, Middle, Only}; + + let filtrate = match separation.filtrate() { + Some(filtrate) => match filtrate.transpose() { + Ok(filtrate) => filtrate.map(|entry| { + entry + .strip_path_prefix(&prefix) + .expect("path is not in tree") + }), + Err(error) => { + return Separation::from(error.map(Err)); + }, + }, + _ => unreachable!(), + }; + let entry = filtrate.as_ref(); + let path = entry.stripped_or_base_path(); + let depth = entry.depth().saturating_sub(1); + for (position, candidate) in path + .components() + .skip(depth) + .filter_map(|component| match component { + Component::Normal(component) => Some(CandidatePath::from(component)), + _ => None, + }) + .zip_longest(component_patterns.iter().skip(depth)) + .with_position() + { + match (position, candidate) { + (First | Middle, Both(candidate, pattern)) => { + if !pattern.is_match(candidate.as_ref()) { + // Do not descend into directories that do not match the corresponding + // component pattern. + return filtrate.filter_tree(cancellation).into(); + } + }, + (Last | Only, Both(candidate, pattern)) => { + return if pattern.is_match(candidate.as_ref()) { + let candidate = CandidatePath::from(path); + if let Some(matched) = complete_pattern + .captures(candidate.as_ref()) + .map(MatchedText::from) + .map(MatchedText::into_owned) + { + filtrate + .map(|entry| Ok(GlobEntry { entry, matched })) + .into() + } + else { + filtrate.filter_node().into() + } + } + else { + // Do not descend into directories that do not match the corresponding + // component pattern. 
+ filtrate.filter_tree(cancellation).into() + }; + }, + (_, Left(_candidate)) => { + let candidate = CandidatePath::from(path); + return if let Some(matched) = complete_pattern + .captures(candidate.as_ref()) + .map(MatchedText::from) + .map(MatchedText::into_owned) + { + filtrate + .map(|entry| Ok(GlobEntry { entry, matched })) + .into() + } + else { + filtrate.filter_node().into() + }; + }, + (_, Right(_pattern)) => { + return filtrate.filter_node().into(); + }, + } + } + // If the component loop is not entered, then check for a match. This may indicate that + // the `Glob` is empty and a single invariant path may be matched. + let candidate = CandidatePath::from(path); + if let Some(matched) = complete_pattern + .captures(candidate.as_ref()) + .map(MatchedText::from) + .map(MatchedText::into_owned) + { + return filtrate + .map(|entry| Ok(GlobEntry { entry, matched })) + .into(); + } + filtrate.filter_node().into() + }); + WalkGlob { input, root } + } + + fn anchor(&self, directory: impl Into) -> Anchor { + fn invariant_path_prefix<'t, A, I>(tokens: I) -> Option + where + A: 't, + I: IntoIterator>, + { + let prefix = token::invariant_text_prefix(tokens); + if prefix.is_empty() { + None + } + else { + Some(prefix.into()) + } + } + + let directory = directory.into(); + // The directory tree is traversed from `root`, which may include an invariant prefix from + // the glob pattern. Patterns are only applied to path components following this prefix in + // `root`. + let (root, prefix) = match invariant_path_prefix(self.tree.as_ref().tokens()) { + Some(prefix) => { + let root = directory.join(&prefix); + if prefix.is_absolute() { + // Absolute paths replace paths with which they are joined, in which case there + // is no prefix. 
+ (root, PathBuf::new()) + } + else { + (root, directory) + } + }, + _ => (directory.clone(), directory), + }; + Anchor { root, prefix } + } +} + +#[derive(Clone, Debug)] +struct Anchor { + root: PathBuf, + prefix: PathBuf, +} + +/// Negated glob combinator that efficiently filters [`WalkEntry`]s. +/// +/// Determines an appropriate [`FilterTarget`] for a [`WalkEntry`] based on the +/// [exhaustiveness][`Pattern::is_exhaustive`] of its component [`Pattern`]s. This can be used with +/// [`FilterTree`] to efficiently filter [`WalkEntry`]s without reading directory trees from the +/// file system when not necessary. +/// +/// [`FilterTarget`]: crate::FilterTarget +/// [`FilterTree`]: crate::FilterTree +/// [`Pattern`]: crate::Pattern +/// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive +/// [`WalkEntry`]: crate::WalkEntry +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Clone, Debug)] +pub struct WalkNegation { + exhaustive: Regex, + nonexhaustive: Regex, +} + +impl WalkNegation { + /// Combines glob expressions into a `WalkNegation`. + /// + /// This function accepts an [`IntoIterator`] with items that implement [`Combine`], such as + /// [`Glob`] and `&str`. + /// + /// # Errors + /// + /// Returns an error if any of the inputs fail to build. If the inputs are a compiled + /// [`Pattern`] type such as [`Glob`], then this only occurs if the compiled program is too + /// large. + /// + /// [`Combine`]: crate::Combine + /// [`Glob`]: crate::Glob + /// [`IntoIterator`]: std::iter::IntoIterator + /// [`Pattern`]: crate::Pattern + pub fn any<'t, I>(patterns: I) -> Result + where + I: IntoIterator, + I::Item: Combine<'t>, + { + let (exhaustive, nonexhaustive) = patterns + .into_iter() + .map(TryInto::try_into) + .collect::, _>>() + .map_err(Into::into)?
+ .into_iter() + .partition::, _>(|tree| token::is_exhaustive(tree.as_ref().tokens())); + let negation = WalkNegation { + exhaustive: crate::any(exhaustive)?.pattern, + nonexhaustive: crate::any(nonexhaustive)?.pattern, + }; + Ok(negation) + } + + /// Gets the appropriate [`FilterTarget`] for the given [`WalkEntry`]. + /// + /// This function can be used with [`FileIterator::filter_tree`] to efficiently filter + /// [`WalkEntry`]s without reading directory trees from the file system when not necessary. + /// + /// Returns [`FilterTarget::Tree`] if the [`WalkEntry`] matches an [exhaustive glob + /// expression][`Pattern::is_exhaustive`], such as `secret/**`. + /// + /// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree + /// [`FilterTarget`]: crate::FilterTarget + /// [`FilterTarget::Tree`]: crate::FilterTarget::Tree + /// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive + /// [`WalkEntry`]: crate::WalkEntry + pub fn residue(&self, entry: &WalkEntry) -> Option> { + let candidate = CandidatePath::from(entry.stripped_or_base_path()); + if self.exhaustive.is_match(candidate.as_ref()) { + // Do not descend into directories that match the exhaustive negation. + Some(TreeResidue::Tree(())) + } + else if self.nonexhaustive.is_match(candidate.as_ref()) { + Some(TreeResidue::Node(())) + } + else { + None + } + } +} + +/// Describes a file matching a [`Glob`] in a directory tree. +/// +/// [`Glob`]: crate::Glob +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Debug)] +pub struct GlobEntry { + entry: WalkEntry, + matched: MatchedText<'static>, +} + +impl GlobEntry { + pub fn into_path(self) -> PathBuf { + self.entry.into_path() + } + + /// Gets the path of the matched file. + pub fn path(&self) -> &Path { + self.entry.path() + } + + /// Converts the entry to the relative [`CandidatePath`].
+ /// + /// **This differs from [`path`] and [`into_path`], which are natively encoded and may be + /// absolute.** The [`CandidatePath`] is always relative to [the root][`Walk::root`] of the + /// directory tree. + /// + /// [`CandidatePath`]: crate::CandidatePath + /// [`into_path`]: crate::WalkEntry::into_path + /// [`matched`]: crate::WalkEntry::matched + /// [`path`]: crate::WalkEntry::path + pub fn to_candidate_path(&self) -> CandidatePath<'_> { + self.matched.to_candidate_path() + } + + pub fn file_type(&self) -> FileType { + self.entry.file_type() + } + + pub fn metadata(&self) -> Result { + self.entry.metadata().map_err(WalkError::from) + } + + /// Gets the depth of the file from [the root][`Walk::root`] of the directory tree. + /// + /// [`Walk::root`]: crate::Walk::root + pub fn depth(&self) -> usize { + self.entry.depth() + } + + /// Gets the matched text in the path of the file. + pub fn matched(&self) -> &MatchedText<'static> { + &self.matched + } +} + +impl AsRef for GlobEntry { + fn as_ref(&self) -> &WalkEntry { + &self.entry + } +} + +impl From for WalkEntry { + fn from(entry: GlobEntry) -> Self { + entry.entry + } +} + +/// Iterator over files matching a [`Glob`] in a directory tree. +/// +/// `Walk` is a `TreeIterator` and supports [`FileIterator::filter_tree`]. +/// +/// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree +/// [`Glob`]: crate::Glob +#[derive(Clone, Debug)] +pub struct WalkGlob { + input: I, + root: PathBuf, +} + +impl WalkGlob { + fn compile<'t, J>(tokens: J) -> Result, CompileError> + where + J: IntoIterator>, + J::IntoIter: Clone, + { + let mut regexes = Vec::new(); + for component in token::components(tokens) { + if component + .tokens() + .iter() + .any(|token| token.has_component_boundary()) + { + // Stop at component boundaries, such as tree wildcards or any boundary within a + // group token. 
+ break; + } + regexes.push(Glob::compile(component.tokens().iter().copied())?); + } + Ok(regexes) + } + + /// Gets the root directory of the traversal. + /// + /// The root directory is determined by joining the directory path in functions like + /// [`Glob::walk`] with any [invariant prefix](`Glob::partition`) of the [`Glob`]. When a + /// [`Glob`] is rooted, the root directory is the same as the invariant prefix. + /// + /// The depth specified via [`WalkBehavior`] is relative to this path. + /// + /// [`Glob`]: crate::Glob + /// [`Glob::partition`]: crate::Glob::partition + /// [`Glob::walk`]: crate::Glob::walk + /// [`WalkBehavior`]: crate::WalkBehavior + pub fn root(&self) -> &Path { + self.root.as_ref() + } +} + +impl Iterator for WalkGlob +where + I: FileIterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + filter::filtrate(self) + } +} + +impl SeparatingFilter for WalkGlob +where + I: SeparatingFilter, +{ + type Feed = I::Feed; + + fn feed(&mut self) -> Option> { + self.input.feed() + } +} + +impl SkipTree for WalkGlob +where + I: SkipTree, +{ + fn skip_tree(&mut self) { + self.input.skip_tree() + } +} diff --git a/src/walk/mod.rs b/src/walk/mod.rs index ee84fdd..a51e342 100644 --- a/src/walk/mod.rs +++ b/src/walk/mod.rs @@ -1,1164 +1,13 @@ #![cfg(feature = "walk")] -// TODO: Split the `walk` module into the following modules: -// - `filter`: separating filters -// - `glob`: pattern matching file iterators -// - `tree`: unfiltered file iterators -// Assemble and limit the contents of these modules via re-exports in `walk`. -mod filter; - -use itertools::Itertools as _; -use regex::Regex; -use std::fs::{FileType, Metadata}; -use std::io; -use std::path::{Component, Path, PathBuf}; -use thiserror::Error; -use walkdir::{self, DirEntry, WalkDir}; +// TODO: Are `cfg(feature = "walk")` attributes redundant on items defined within this module? 
-use crate::capture::MatchedText; -use crate::encode::CompileError; -use crate::token::{self, Token, TokenTree}; -use crate::walk::filter::{ - Isomeric, SeparatingFilter, SeparatingFilterInput, Separation, SkipTree, TreeIterator, - WalkCancellation, -}; -use crate::{BuildError, CandidatePath, Combine, Glob}; +mod filter; +mod glob; +mod tree; pub use crate::walk::filter::TreeResidue; - -pub type FileFiltrate = Result; -pub type FileResidue = TreeResidue; -pub type FileFeed = (FileFiltrate, FileResidue); - -impl Isomeric for (T, FileResidue) -where - T: AsRef, -{ - type Substituent<'a> = &'a WalkEntry - where - Self: 'a; - - fn substituent(separation: &Separation) -> Self::Substituent<'_> { - match separation { - Separation::Filtrate(ref filtrate) => filtrate.get().as_ref(), - Separation::Residue(ref residue) => residue.get().as_ref(), - } - } -} - -/// Describes errors that occur when matching a [`Glob`] against a directory tree. -/// -/// `WalkError` implements conversion into [`io::Error`]. -/// -/// [`Glob`]: crate::Glob -/// [`io::Error`]: std::io::Error -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Debug, Error)] -#[error("failed to match directory tree: {kind}")] -pub struct WalkError { - depth: usize, - kind: WalkErrorKind, -} - -impl WalkError { - /// Gets the path at which the error occurred. - /// - /// Returns `None` if there is no path associated with the error. - pub fn path(&self) -> Option<&Path> { - self.kind.path() - } - - /// Gets the depth from [the root][`Walk::root`] at which the error occurred. 
- /// - /// [`Walk::root`]: crate::Walk::root - pub fn depth(&self) -> usize { - self.depth - } -} - -impl From for WalkError { - fn from(error: walkdir::Error) -> Self { - let depth = error.depth(); - let path = error.path().map(From::from); - if error.io_error().is_some() { - WalkError { - depth, - kind: WalkErrorKind::Io { - path, - error: error.into_io_error().expect("incongruent error kind"), - }, - } - } - else { - WalkError { - depth, - kind: WalkErrorKind::LinkCycle { - root: error - .loop_ancestor() - .expect("incongruent error kind") - .into(), - leaf: path.expect("incongruent error kind"), - }, - } - } - } -} - -impl From for io::Error { - fn from(error: WalkError) -> Self { - let kind = match error.kind { - WalkErrorKind::Io { ref error, .. } => error.kind(), - _ => io::ErrorKind::Other, - }; - io::Error::new(kind, error) - } -} - -#[derive(Debug, Error)] -#[non_exhaustive] -enum WalkErrorKind { - #[error("failed to read file at `{path:?}`: {error}")] - Io { - path: Option, - error: io::Error, - }, - #[error("symbolic link cycle detected from `{root}` to `{leaf}`")] - LinkCycle { root: PathBuf, leaf: PathBuf }, -} - -impl WalkErrorKind { - pub fn path(&self) -> Option<&Path> { - match self { - WalkErrorKind::Io { ref path, .. } => path.as_ref().map(PathBuf::as_ref), - WalkErrorKind::LinkCycle { ref leaf, .. 
} => Some(leaf.as_ref()), - } - } -} - -impl<'t> Glob<'t> { - #[cfg_attr(docsrs, doc(cfg(feature = "walk")))] - pub fn walk( - &self, - directory: impl Into, - ) -> WalkGlob> { - self.walk_with_behavior(directory, WalkBehavior::default()) - } - - #[cfg_attr(docsrs, doc(cfg(feature = "walk")))] - pub fn walk_with_behavior( - &self, - directory: impl Into, - behavior: impl Into, - ) -> WalkGlob> { - let Anchor { root, prefix } = self.anchor(directory); - let component_patterns = WalkGlob::<()>::compile(self.tree.as_ref().tokens()) - .expect("failed to compile glob sub-expressions"); - let complete_pattern = self.pattern.clone(); - let input = WalkTree::from({ - let WalkBehavior { depth, link } = behavior.into(); - WalkDir::new(root.clone()) - .follow_links(match link { - LinkBehavior::ReadFile => false, - LinkBehavior::ReadTarget => true, - }) - .max_depth(depth) - }) - .filter_map_tree(move |cancellation, separation| { - use itertools::EitherOrBoth::{Both, Left, Right}; - use itertools::Position::{First, Last, Middle, Only}; - - let filtrate = match separation.filtrate() { - Some(filtrate) => match filtrate.transpose() { - Ok(filtrate) => filtrate.map(|entry| { - entry - .strip_path_prefix(&prefix) - .expect("path is not in tree") - }), - Err(error) => { - return Separation::from(error.map(|error| Err(error.into()))); - }, - }, - _ => unreachable!(), - }; - let entry = filtrate.as_ref(); - let path = entry.stripped_or_base_path(); - let depth = entry.depth().saturating_sub(1); - for (position, candidate) in path - .components() - .skip(depth) - .filter_map(|component| match component { - Component::Normal(component) => Some(CandidatePath::from(component)), - _ => None, - }) - .zip_longest(component_patterns.iter().skip(depth)) - .with_position() - { - match (position, candidate) { - (First | Middle, Both(candidate, pattern)) => { - if !pattern.is_match(candidate.as_ref()) { - // Do not descend into directories that do not match the corresponding - // component 
pattern. - return filtrate.filter_tree(cancellation).into(); - } - }, - (Last | Only, Both(candidate, pattern)) => { - return if pattern.is_match(candidate.as_ref()) { - let candidate = CandidatePath::from(path); - if let Some(matched) = complete_pattern - .captures(candidate.as_ref()) - .map(MatchedText::from) - .map(MatchedText::into_owned) - { - filtrate - .map(|entry| Ok(GlobEntry { entry, matched })) - .into() - } - else { - filtrate.filter_node().into() - } - } - else { - // Do not descend into directories that do not match the corresponding - // component pattern. - filtrate.filter_tree(cancellation).into() - }; - }, - (_, Left(_candidate)) => { - let candidate = CandidatePath::from(path); - return if let Some(matched) = complete_pattern - .captures(candidate.as_ref()) - .map(MatchedText::from) - .map(MatchedText::into_owned) - { - filtrate - .map(|entry| Ok(GlobEntry { entry, matched })) - .into() - } - else { - filtrate.filter_node().into() - }; - }, - (_, Right(_pattern)) => { - return filtrate.filter_node().into(); - }, - } - } - // If the component loop is not entered, then check for a match. This may indicate that - // the `Glob` is empty and a single invariant path may be matched. - let candidate = CandidatePath::from(path); - if let Some(matched) = complete_pattern - .captures(candidate.as_ref()) - .map(MatchedText::from) - .map(MatchedText::into_owned) - { - return filtrate - .map(|entry| Ok(GlobEntry { entry, matched })) - .into(); - } - filtrate.filter_node().into() - }); - WalkGlob { input, root } - } - - fn anchor(&self, directory: impl Into) -> Anchor { - fn invariant_path_prefix<'t, A, I>(tokens: I) -> Option - where - A: 't, - I: IntoIterator>, - { - let prefix = token::invariant_text_prefix(tokens); - if prefix.is_empty() { - None - } - else { - Some(prefix.into()) - } - } - - let directory = directory.into(); - // The directory tree is traversed from `root`, which may include an invariant prefix from - // the glob pattern. 
Patterns are only applied to path components following this prefix in - // `root`. - let (root, prefix) = match invariant_path_prefix(self.tree.as_ref().tokens()) { - Some(prefix) => { - let root = directory.join(&prefix); - if prefix.is_absolute() { - // Absolute paths replace paths with which they are joined, in which case there - // is no prefix. - (root, PathBuf::new().into()) - } - else { - (root, directory) - } - }, - _ => (directory.clone(), directory), - }; - Anchor { root, prefix } - } -} - -#[derive(Clone, Debug)] -struct Anchor { - root: PathBuf, - prefix: PathBuf, -} - -//impl Isomer for GlobEntry { -// type Substituent<'a> = &'a WalkEntry -// where -// Self: 'a; -// -// fn substituent(&self) -> Self::Substituent<'_> { -// self.as_ref() -// } -//} -// -//impl Isomer for FileResidue { -// type Substituent<'a> = &'a WalkEntry -// where -// Self: 'a; -// -// fn substituent(&self) -> Self::Substituent<'_> { -// self.as_ref() -// } -//} -// -//impl Isomer for WalkEntry { -// type Substituent<'a> = &'a WalkEntry -// where -// Self: 'a; -// -// fn substituent(&self) -> Self::Substituent<'_> { -// self -// } -//} -// -//impl Isomer for FileResidue { -// type Substituent<'a> = &'a WalkEntry -// where -// Self: 'a; -// -// fn substituent(&self) -> Self::Substituent<'_> { -// self.as_ref() -// } -//} - -#[derive(Clone, Debug)] -pub struct WalkEntry { - entry: DirEntry, - stripped: Option, -} - -impl WalkEntry { - pub fn strip_path_prefix(self, prefix: impl AsRef) -> Option { - self.path() - .strip_prefix(prefix) - .ok() - .map(PathBuf::from) - .map(|stripped| { - let WalkEntry { entry, .. } = self; - WalkEntry { - entry, - stripped: Some(stripped), - } - }) - } - - pub fn unstrip_path_prefix(self) -> Self { - let WalkEntry { entry, .. 
} = self; - WalkEntry { - entry, - stripped: None, - } - } - - pub fn stripped_or_base_path(&self) -> &Path { - self.stripped_path().unwrap_or_else(|| self.path()) - } - - pub fn stripped_path(&self) -> Option<&Path> { - self.stripped.as_ref().map(AsRef::as_ref) - } - - pub fn has_stripped_path(&self) -> bool { - self.stripped.is_some() - } -} - -impl AsRef for WalkEntry { - fn as_ref(&self) -> &WalkEntry { - self - } -} - -// TODO: Don't do this! Delegate/forward instead. -use std::ops::Deref; -impl Deref for WalkEntry { - type Target = DirEntry; - - fn deref(&self) -> &Self::Target { - &self.entry - } -} - -impl From for WalkEntry { - fn from(entry: DirEntry) -> Self { - WalkEntry { - entry, - stripped: None, - } - } -} - -impl From for WalkEntry { - fn from(entry: GlobEntry) -> Self { - entry.entry - } -} - -#[derive(Debug)] -pub struct WalkTree { - is_dir: bool, - input: walkdir::IntoIter, -} - -impl From for WalkTree { - fn from(walkdir: WalkDir) -> Self { - WalkTree { - is_dir: false, - input: walkdir.into_iter(), - } - } -} - -impl Iterator for WalkTree { - type Item = Result; - - fn next(&mut self) -> Option { - let (is_dir, next) = match self.input.next() { - Some(result) => match result { - Ok(entry) => (entry.file_type().is_dir(), Some(Ok(entry.into()))), - Err(error) => (false, Some(Err(error.into()))), - }, - _ => (false, None), - }; - self.is_dir = is_dir; - next - } -} - -impl SeparatingFilterInput for WalkTree { - type Feed = (Result, TreeResidue); -} - -impl SkipTree for WalkTree { - fn skip_tree(&mut self) { - // `IntoIter::skip_current_dir` discards the least recently yielded directory, but - // `skip_tree` must act upon the most recently yielded node regardless of its topology - // (leaf vs. branch). - if self.is_dir { - self.input.skip_current_dir(); - } - } -} - -/// An [`Iterator`] over [`WalkEntry`]s that can filter directory trees. -/// -/// A `FileIterator` is a `TreeIterator` that yields [`WalkEntry`]s. 
This trait is implemented by -/// [`Walk`] and adaptors like [`FilterTree`]. A `TreeIterator` is an iterator that reads its items -/// from a tree and therefore can meaningfully filter not only items but their corresponding -/// sub-trees to avoid unnecessary work. To that end, this trait provides the `filter_tree` -/// function, which allows directory trees to be discarded (not read from the file system) when -/// matching [`Glob`]s against directory trees. -/// -/// [`filter_tree`]: crate::FileIterator::filter_tree -/// [`Glob`]: crate::Glob -/// [`Iterator`]: std::iter::Iterator -/// [`WalkEntry`]: crate::WalkEntry -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -pub trait FileIterator: - Iterator> + TreeIterator> -{ - type Entry; - - /// Filters [`WalkEntry`]s and controls the traversal of directory trees. - /// - /// This function creates an adaptor that filters [`WalkEntry`]s and furthermore specifies how - /// iteration proceeds to traverse directory trees. The adaptor accepts a function that, when - /// discarding a [`WalkEntry`], yields a [`FilterTarget`]. **If the entry refers to a directory - /// and [`FilterTarget::Tree`] is returned by the function, then iteration will not descend - /// into that directory and the tree will not be read from the file system.** Therefore, this - /// adaptor should be preferred over functions like [`Iterator::filter`] when discarded - /// directories do not need to be read. - /// - /// Errors are not filtered, so if an error occurs reading a file at a path that would have - /// been discarded, then that error is still yielded by the iterator. - /// - /// # Examples - /// - /// The [`FilterTree`] adaptor can be used to apply additional custom filtering that avoids - /// unnecessary directory reads. The following example filters out hidden files on Unix and - /// Windows. On Unix, hidden files are filtered out nominally via [`not`]. On Windows, - /// `filter_tree` is used to detect the [hidden attribute][attributes]. 
In both cases, the - /// adaptor does not read conventionally hidden directory trees. - /// - /// ```rust,no_run - /// #[cfg(windows)] - /// use wax::TreeResidue; - /// use wax::{FileIterator, Glob}; - /// - /// let glob = Glob::new("**/*.(?i){jpg,jpeg}").unwrap(); - /// let walk = glob.walk("./Pictures"); - /// // Filter out nominally hidden files on Unix. Like `filter_tree`, `not` does not perform - /// // unnecessary reads of directory trees. - /// #[cfg(unix)] - /// let walk = walk.not(["**/.*/**"]).unwrap(); - /// // Filter out files with the hidden attribute on Windows. - /// #[cfg(windows)] - /// let walk = walk.filter_tree(|entry| { - /// use std::os::windows::fs::MetadataExt as _; - /// - /// const ATTRIBUTE_HIDDEN: u32 = 0x2; - /// - /// let attributes = entry.metadata().unwrap().file_attributes(); - /// if (attributes & ATTRIBUTE_HIDDEN) == ATTRIBUTE_HIDDEN { - /// // Do not read hidden directory trees. - /// Some(TreeResidue::Tree(())) - /// } - /// else { - /// None - /// } - /// }); - /// for entry in walk { - /// let entry = entry.unwrap(); - /// println!("JPEG: {:?}", entry.path()); - /// } - /// ``` - /// - /// [`FilterTree`]: crate::FilterTree - /// [`Iterator`]: std::iter::Iterator - /// [`Iterator::filter`]: std::iter::Iterator::filter - /// [`not`]: crate::Walk::not - /// [`Walk`]: crate::Walk - /// [`WalkEntry`]: crate::WalkEntry - /// - /// [attributes]: https://docs.microsoft.com/en-us/windows/win32/fileio/file-attribute-constants - fn filter_tree(self, f: F) -> FilterTree - where - WalkEntry: From, - Self: Sized, - Self::Entry: AsRef, - F: FnMut(&WalkEntry) -> Option>, - { - FilterTree { input: self, f } - } - - /// Filters [`WalkEntry`]s against negated glob expressions. - /// - /// This function creates an adaptor that discards [`WalkEntry`]s that match any of the given - /// glob expressions. 
This allows for broad negations while matching a [`Glob`] against a - /// directory tree that cannot be achieved using a single glob expression alone. - /// - /// The adaptor is constructed via [`FilterTree`] and [`WalkNegation`] and therefore does not - /// read directory trees from the file system when a directory matches an [exhaustive glob - /// expression][`Pattern::is_exhaustive`] such as `**/private/**` or `hidden/</>*`. **This - /// function should be preferred when filtering [`WalkEntry`]s against [`Glob`]s, since this - /// avoids potentially large and unnecessary reads**. - /// - /// # Errors - /// - /// Returns an error if any of the inputs fail to build. If the inputs are a compiled - /// [`Pattern`] type such as [`Glob`], then this only occurs if the compiled program is too - /// large. - /// - /// # Examples - /// - /// Because glob expressions do not support general negations, it is sometimes impossible to - /// express patterns that deny particular text. In such cases, `not` can be used to apply - /// additional patterns as a filter. - /// - /// ```rust,no_run - /// use wax::{FileIterator, Glob}; - /// - /// // Find image files, but not if they are beneath a directory with a name that suggests that - /// // they are private. - /// let glob = Glob::new("**/*.(?i){jpg,jpeg,png}").unwrap(); - /// for entry in glob.walk(".").not(["**/(?i)<.:0,1>private/**"]).unwrap() { - /// let entry = entry.unwrap(); - /// // ... 
- /// } - /// ``` - /// - /// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree - /// [`Glob`]: crate::Glob - /// [`Iterator::filter`]: std::iter::Iterator::filter - /// [`Pattern`]: crate::Pattern - /// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive - /// [`WalkEntry`]: crate::WalkEntry - /// [`WalkNegation`]: crate::WalkNegation - fn not<'t, I>(self, patterns: I) -> Result, BuildError> - where - WalkEntry: From, - Self: Sized, - Self::Entry: AsRef, - I: IntoIterator, - I::Item: Combine<'t>, - { - WalkNegation::any(patterns).map(|negation| Not { - input: self, - negation, - }) - } -} - -impl FileIterator for I -where - I: Iterator> + TreeIterator>, -{ - type Entry = T; -} - -/// Negated combinator that efficiently filters [`WalkEntry`]s. -/// -/// Determines an appropriate [`FilterTarget`] for a [`WalkEntry`] based on the -/// [exhaustiveness][`Pattern::is_exhaustive`] of its component [`Pattern`]s. This can be used with -/// [`FilterTree`] to efficiently filter [`WalkEntry`]s without reading directory trees from the -/// file system when not necessary. -/// -/// [`FilterTarget`]: crate::FilterTarget -/// [`FilterTree`]: crate::FilterTree -/// [`Pattern`]: crate::Pattern -/// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive -/// [`WalkEntry`]: crate::WalkEntry -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Clone, Debug)] -pub struct WalkNegation { - exhaustive: Regex, - nonexhaustive: Regex, -} - -impl WalkNegation { - /// Combines glob expressions into a `WalkNegation`. - /// - /// This function accepts an [`IntoIterator`] with items that implement [`Combine`], such as - /// [`Glob`] and `&str`. - /// - /// # Errors - /// - /// Returns an error if any of the inputs fail to build. If the inputs are a compiled - /// [`Pattern`] types such as [`Glob`], then this only occurs if the compiled program is too - /// large. 
- /// - /// [`Combine`]: crate::Combine - /// [`Glob`]: crate::Glob - /// [`IntoIterator`]: std::iter::IntoIterator - /// [`Pattern`]: crate::Pattern - pub fn any<'t, I>(patterns: I) -> Result - where - I: IntoIterator, - I::Item: Combine<'t>, - { - let (exhaustive, nonexhaustive) = patterns - .into_iter() - .map(TryInto::try_into) - .collect::, _>>() - .map_err(Into::into)? - .into_iter() - .partition::, _>(|tree| token::is_exhaustive(tree.as_ref().tokens())); - let negation = WalkNegation { - exhaustive: crate::any(exhaustive)?.pattern, - nonexhaustive: crate::any(nonexhaustive)?.pattern, - }; - Ok(negation) - } - - /// Gets the appropriate [`FilterTarget`] for the given [`WalkEntry`]. - /// - /// This function can be used with [`FileIterator::filter_tree`] to effeciently filter - /// [`WalkEntry`]s without reading directory trees from the file system when not necessary. - /// - /// Returns [`FilterTarget::Tree`] if the [`WalkEntry`] matches an [exhaustive glob - /// expression][`Pattern::is_exhaustive`], such as `secret/**`. - /// - /// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree - /// [`FilterTarget`]: crate::FilterTarget - /// [`FilterTarget::Tree`]: crate::FilterTarget::Tree - /// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive - /// [`WalkEntry`]: crate::WalkEntry - //pub fn filter( - // &self, - // cancellation: WalkCancellation<'_, I>, - // filtered: Filtered>, - //) -> Filtered> - //where - // I: FileIterator, - // T: 'static + AsRef, - // WalkEntry: From, - //{ - // // Transpose from `Filtered<(Result, WalkEntry)>` into - // // `Result, WalkError>`. - // match filtered.transpose_filtrate() { - // Ok(filtered) => { - // // Get the substituent `WalkEntry` from either the filtrate or the residue. 
- // let entry: &WalkEntry = match filtered { - // Filtered::Filtrate(ref entry) => entry.as_ref().as_ref(), - // Filtered::Residue(ref entry) => entry.as_ref(), - // }; - // let path = CandidatePath::from(entry.path()); - // if self.exhaustive.is_match(path.as_ref()) { - // // Do not descend into directories that match the exhaustive negation. - // filtered.filter_map_tree(cancellation, From::from) - // } - // else if self.nonexhaustive.is_match(path.as_ref()) { - // filtered.filter_map_node(From::from) - // } - // else { - // filtered - // } - // .map_filtrate(Ok) - // }, - // Err(error) => error.map(Err).into(), - // } - //} - pub fn residue(&self, entry: &WalkEntry) -> Option> { - let candidate = CandidatePath::from(entry.stripped_or_base_path()); - if self.exhaustive.is_match(candidate.as_ref()) { - // Do not descend into directories that match the exhaustive negation. - Some(TreeResidue::Tree(())) - } - else if self.nonexhaustive.is_match(candidate.as_ref()) { - Some(TreeResidue::Node(())) - } - else { - None - } - } -} - -/// Configuration for interpreting symbolic links. -/// -/// Determines how symbolic links are interpreted when traversing directory trees using functions -/// like [`Glob::walk`]. **By default, symbolic links are read as regular files and their targets -/// are ignored.** -/// -/// [`Glob::walk`]: crate::Glob::walk -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub enum LinkBehavior { - /// Read the symbolic link file itself. - /// - /// This behavior reads the symbolic link as a regular file. The corresponding [`WalkEntry`] - /// uses the path of the link file and its metadata describes the link file itself. The target - /// is effectively ignored and traversal will **not** follow the link. - /// - /// [`WalkEntry`]: crate::WalkEntry - #[default] - ReadFile, - /// Read the target of the symbolic link. - /// - /// This behavior reads the target of the symbolic link. 
The corresponding [`WalkEntry`] uses - /// the path of the link file and its metadata describes the target. If the target is a - /// directory, then traversal will follow the link and descend into the target. - /// - /// If a link is reentrant and forms a cycle, then an error will be emitted instead of a - /// [`WalkEntry`] and traversal will not follow the link. - /// - /// [`WalkEntry`]: crate::WalkEntry - ReadTarget, -} - -/// Configuration for matching [`Glob`]s against directory trees. -/// -/// Determines the behavior of the traversal within a directory tree when using functions like -/// [`Glob::walk`]. `WalkBehavior` can be constructed via conversions from types representing its -/// fields. APIs generally accept `impl Into`, so these conversion can be used -/// implicitly. When constructed using such a conversion, `WalkBehavior` will use defaults for any -/// remaining fields. -/// -/// # Examples -/// -/// By default, symbolic links are interpreted as regular files and targets are ignored. To read -/// linked targets, use [`LinkBehavior::ReadTarget`]. -/// -/// ```rust -/// use wax::{Glob, LinkBehavior}; -/// -/// for entry in Glob::new("**") -/// .unwrap() -/// .walk_with_behavior(".", LinkBehavior::ReadTarget) -/// { -/// let entry = entry.unwrap(); -/// // ... -/// } -/// ``` -/// -/// [`Glob`]: crate::Glob -/// [`Glob::walk`]: crate::Glob::walk -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct WalkBehavior { - // TODO: Consider using a dedicated type for this field. Using primitive types does not - // interact well with conversions used in `walk` APIs. For example, if another `usize` - // field is introduced, then the conversions become ambiguous and confusing. - /// Maximum depth. - /// - /// Determines the maximum depth to which a directory tree will be traversed relative to [the - /// root][`Walk::root`]. 
A depth of zero corresponds to the root and so using such a depth will - /// yield at most one entry for the root. - /// - /// The default value is [`usize::MAX`]. - /// - /// [`usize::MAX`]: usize::MAX - /// [`Walk::root`]: crate::Walk::root - pub depth: usize, - /// Interpretation of symbolic links. - /// - /// Determines how symbolic links are interpreted when traversing a directory tree. See - /// [`LinkBehavior`]. - /// - /// The default value is [`LinkBehavior::ReadFile`]. - /// - /// [`LinkBehavior`]: crate::LinkBehavior - /// [`LinkBehavior::ReadFile`]: crate::LinkBehavior::ReadFile - pub link: LinkBehavior, -} - -/// Constructs a `WalkBehavior` using the following defaults: -/// -/// | Field | Description | Value | -/// |-----------|-----------------------------------|----------------------------| -/// | [`depth`] | Maximum depth. | [`usize::MAX`] | -/// | [`link`] | Interpretation of symbolic links. | [`LinkBehavior::ReadFile`] | -/// -/// [`depth`]: crate::WalkBehavior::depth -/// [`link`]: crate::WalkBehavior::link -/// [`LinkBehavior::ReadFile`]: crate::LinkBehavior::ReadFile -/// [`usize::MAX`]: usize::MAX -impl Default for WalkBehavior { - fn default() -> Self { - WalkBehavior { - depth: usize::MAX, - link: LinkBehavior::default(), - } - } -} - -impl From<()> for WalkBehavior { - fn from(_: ()) -> Self { - Default::default() - } -} - -impl From for WalkBehavior { - fn from(link: LinkBehavior) -> Self { - WalkBehavior { - link, - ..Default::default() - } - } -} - -impl From for WalkBehavior { - fn from(depth: usize) -> Self { - WalkBehavior { - depth, - ..Default::default() - } - } -} - -/// Iterator over files matching a [`Glob`] in a directory tree. -/// -/// `Walk` is a `TreeIterator` and supports [`FileIterator::filter_tree`]. 
-/// -/// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree -/// [`Glob`]: crate::Glob -#[derive(Clone, Debug)] -pub struct WalkGlob { - input: I, - root: PathBuf, -} - -impl WalkGlob { - fn compile<'t, J>(tokens: J) -> Result, CompileError> - where - J: IntoIterator>, - J::IntoIter: Clone, - { - let mut regexes = Vec::new(); - for component in token::components(tokens) { - if component - .tokens() - .iter() - .any(|token| token.has_component_boundary()) - { - // Stop at component boundaries, such as tree wildcards or any boundary within a - // group token. - break; - } - regexes.push(Glob::compile(component.tokens().iter().copied())?); - } - Ok(regexes) - } - - /// Gets the root directory of the traversal. - /// - /// The root directory is determined by joining the directory path in functions like - /// [`Glob::walk`] with any [invariant prefix](`Glob::partition`) of the [`Glob`]. When a - /// [`Glob`] is rooted, the root directory is the same as the invariant prefix. - /// - /// The depth specified via [`WalkBehavior`] is relative to this path. - /// - /// [`Glob`]: crate::Glob - /// [`Glob::partition`]: crate::Glob::partition - /// [`Glob::walk`]: crate::Glob::walk - /// [`WalkBehavior`]: crate::WalkBehavior - pub fn root(&self) -> &Path { - self.root.as_ref() - } -} - -impl Iterator for WalkGlob -where - I: FileIterator, -{ - type Item = I::Item; - - fn next(&mut self) -> Option { - filter::filtrate(self) - } -} - -impl SeparatingFilter for WalkGlob -where - I: SeparatingFilter, -{ - type Feed = I::Feed; - - fn feed(&mut self) -> Option> { - self.input.feed() - } -} - -impl SkipTree for WalkGlob -where - I: SkipTree, -{ - fn skip_tree(&mut self) { - self.input.skip_tree() - } -} - -/// Describes how files are read and discarded by [`FilterTree`]. -/// -/// [`FilterTree`]: crate::FilterTree -//#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -//#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] -//pub enum FilterTarget { -// /// Discard the file. 
-// /// -// /// The [`WalkEntry`] for the given file is discarded by the [`FilterTree`] adaptor. Only this -// /// particular file is ignored and if the entry represents a directory, then its tree is still -// /// read from the file system. -// /// -// /// [`FilterTree`]: crate::FilterTree -// /// [`WalkEntry`]: crate::WalkEntry -// File, -// /// Discard the file and its directory tree, if any. -// /// -// /// The [`WalkEntry`] for the given file is discarded by the [`FilterTree`] adaptor. If the -// /// entry represents a directory, then its entire tree is ignored and is not read from the file -// /// system. -// /// -// /// When the [`WalkEntry`] represents a normal file (not a directory), then this is the same as -// /// [`FilterTarget::File`]. -// /// -// /// [`FilterTarget::File`]: crate::FilterTarget::File -// /// [`FilterTree`]: crate::FilterTree -// /// [`WalkEntry`]: crate::WalkEntry -// Tree, -//} - -// TODO: Implement this using combinators provided by the `filter` module and RPITIT once it lands -// in stable Rust. Remove any use of `WalkCancellation::unchecked`. -/// Iterator adaptor that filters [`WalkEntry`]s and controls the traversal of directory trees. -/// -/// This adaptor is returned by [`FileIterator::filter_tree`] and in addition to filtering -/// [`WalkEntry`]s also determines how `TreeIterator`s traverse directory trees. If discarded -/// directories do not need to be read from the file system, then **this adaptor should be -/// preferred over functions like [`Iterator::filter`], because it can avoid potentially large and -/// unnecessary reads.** -/// -/// `FilterTree` is a `TreeIterator` and supports [`FileIterator::filter_tree`] so `filter_tree` -/// may be chained. 
-/// -/// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree -/// [`WalkEntry`]: crate::WalkEntry -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Clone, Debug)] -pub struct FilterTree { - input: I, - f: F, -} - -impl SeparatingFilter for FilterTree -where - WalkEntry: From, - T: 'static + AsRef, - I: FileIterator, - F: FnMut(&WalkEntry) -> Option>, -{ - type Feed = I::Feed; - - fn feed(&mut self) -> Option> { - self.input - .feed() - .map(|separation| match separation.transpose_filtrate() { - Ok(separation) => separation - .filter_tree_by_substituent( - WalkCancellation::unchecked(&mut self.input), - &mut self.f, - ) - .map_filtrate(Ok), - Err(error) => error.map(Err).into(), - }) - } -} - -impl Iterator for FilterTree -where - WalkEntry: From, - T: 'static + AsRef, - I: FileIterator, - F: FnMut(&WalkEntry) -> Option>, -{ - type Item = I::Item; - - fn next(&mut self) -> Option { - filter::filtrate(self) - } -} - -impl SkipTree for FilterTree -where - I: SkipTree, -{ - fn skip_tree(&mut self) { - self.input.skip_tree() - } -} - -// TODO: Implement this using combinators provided by the `filter` module and RPITIT once it lands -// in stable Rust. Remove any use of `WalkCancellation::unchecked`. 
-#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Clone, Debug)] -pub struct Not { - input: I, - negation: WalkNegation, -} - -impl SeparatingFilter for Not -where - WalkEntry: From, - T: 'static + AsRef, - I: FileIterator, -{ - type Feed = I::Feed; - - fn feed(&mut self) -> Option> { - self.input - .feed() - .map(|separation| match separation.transpose_filtrate() { - Ok(separation) => separation - .filter_tree_by_substituent( - WalkCancellation::unchecked(&mut self.input), - |substituent| self.negation.residue(substituent), - ) - .map_filtrate(Ok), - Err(error) => error.map(Err).into(), - }) - } -} - -impl Iterator for Not -where - WalkEntry: From, - T: 'static + AsRef, - I: FileIterator, -{ - type Item = I::Item; - - fn next(&mut self) -> Option { - filter::filtrate(self) - } -} - -impl SkipTree for Not -where - I: SkipTree, -{ - fn skip_tree(&mut self) { - self.input.skip_tree() - } -} - -/// Describes a file matching a [`Glob`] in a directory tree. -/// -/// [`Glob`]: crate::Glob -#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] -#[derive(Debug)] -pub struct GlobEntry { - entry: WalkEntry, - matched: MatchedText<'static>, -} - -impl GlobEntry { - pub fn into_path(self) -> PathBuf { - self.entry.entry.into_path() - } - - /// Gets the path of the matched file. - pub fn path(&self) -> &Path { - self.entry.path() - } - - /// Converts the entry to the relative [`CandidatePath`]. - /// - /// **This differs from [`path`] and [`into_path`], which are natively encoded and may be - /// absolute.** The [`CandidatePath`] is always relative to [the root][`Walk::root`] of the - /// directory tree. 
- /// - /// [`CandidatePath`]: crate::CandidatePath - /// [`into_path`]: crate::WalkEntry::into_path - /// [`matched`]: crate::WalkEntry::matched - /// [`path`]: crate::WalkEntry::path - pub fn to_candidate_path(&self) -> CandidatePath<'_> { - self.matched.to_candidate_path() - } - - pub fn file_type(&self) -> FileType { - self.entry.file_type() - } - - pub fn metadata(&self) -> Result { - self.entry.metadata().map_err(WalkError::from) - } - - /// Gets the depth of the file from [the root][`Walk::root`] of the directory tree. - /// - /// [`Walk::root`]: crate::Walk::root - pub fn depth(&self) -> usize { - self.entry.depth() - } - - /// Gets the matched text in the path of the file. - pub fn matched(&self) -> &MatchedText<'static> { - &self.matched - } -} - -impl AsRef for GlobEntry { - fn as_ref(&self) -> &WalkEntry { - &self.entry - } -} +pub use crate::walk::glob::{GlobEntry, WalkGlob, WalkNegation}; +pub use crate::walk::tree::{ + FileIterator, FilterTree, LinkBehavior, Not, WalkBehavior, WalkEntry, WalkError, WalkTree, +}; diff --git a/src/walk/tree.rs b/src/walk/tree.rs new file mode 100644 index 0000000..f5121a7 --- /dev/null +++ b/src/walk/tree.rs @@ -0,0 +1,677 @@ +use std::fs::{FileType, Metadata}; +use std::io; +use std::path::{Path, PathBuf}; +use thiserror::Error; +use walkdir::{self, DirEntry, WalkDir}; + +use crate::walk::filter::{ + self, Isomeric, SeparatingFilter, SeparatingFilterInput, Separation, SkipTree, TreeIterator, + TreeResidue, WalkCancellation, +}; +use crate::walk::glob::WalkNegation; +use crate::{BuildError, Combine}; + +pub type FileFiltrate = Result; +pub type FileResidue = TreeResidue; +pub type FileFeed = (FileFiltrate, FileResidue); + +impl Isomeric for (T, FileResidue) +where + T: AsRef, +{ + type Substituent<'a> = &'a WalkEntry + where + Self: 'a; + + fn substituent(separation: &Separation) -> Self::Substituent<'_> { + match separation { + Separation::Filtrate(ref filtrate) => filtrate.get().as_ref(), + Separation::Residue(ref 
residue) => residue.get().as_ref(), + } + } +} + +/// Describes errors that occur when matching a [`Glob`] against a directory tree. +/// +/// `WalkError` implements conversion into [`io::Error`]. +/// +/// [`Glob`]: crate::Glob +/// [`io::Error`]: std::io::Error +#[derive(Debug, Error)] +#[error("failed to match directory tree: {kind}")] +pub struct WalkError { + depth: usize, + kind: WalkErrorKind, +} + +impl WalkError { + /// Gets the path at which the error occurred. + /// + /// Returns `None` if there is no path associated with the error. + pub fn path(&self) -> Option<&Path> { + self.kind.path() + } + + /// Gets the depth from [the root][`Walk::root`] at which the error occurred. + /// + /// [`Walk::root`]: crate::Walk::root + pub fn depth(&self) -> usize { + self.depth + } +} + +impl From for WalkError { + fn from(error: walkdir::Error) -> Self { + let depth = error.depth(); + let path = error.path().map(From::from); + if error.io_error().is_some() { + WalkError { + depth, + kind: WalkErrorKind::Io { + path, + error: error.into_io_error().expect("incongruent error kind"), + }, + } + } + else { + WalkError { + depth, + kind: WalkErrorKind::LinkCycle { + root: error + .loop_ancestor() + .expect("incongruent error kind") + .into(), + leaf: path.expect("incongruent error kind"), + }, + } + } + } +} + +impl From for io::Error { + fn from(error: WalkError) -> Self { + let kind = match error.kind { + WalkErrorKind::Io { ref error, .. } => error.kind(), + _ => io::ErrorKind::Other, + }; + io::Error::new(kind, error) + } +} + +#[derive(Debug, Error)] +#[non_exhaustive] +enum WalkErrorKind { + #[error("failed to read file at `{path:?}`: {error}")] + Io { + path: Option, + error: io::Error, + }, + #[error("symbolic link cycle detected from `{root}` to `{leaf}`")] + LinkCycle { root: PathBuf, leaf: PathBuf }, +} + +impl WalkErrorKind { + pub fn path(&self) -> Option<&Path> { + match self { + WalkErrorKind::Io { ref path, .. 
} => path.as_ref().map(PathBuf::as_ref), + WalkErrorKind::LinkCycle { ref leaf, .. } => Some(leaf.as_ref()), + } + } +} + +/// Configuration for interpreting symbolic links. +/// +/// Determines how symbolic links are interpreted when traversing directory trees using functions +/// like [`Glob::walk`]. **By default, symbolic links are read as regular files and their targets +/// are ignored.** +/// +/// [`Glob::walk`]: crate::Glob::walk +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum LinkBehavior { + /// Read the symbolic link file itself. + /// + /// This behavior reads the symbolic link as a regular file. The corresponding [`WalkEntry`] + /// uses the path of the link file and its metadata describes the link file itself. The target + /// is effectively ignored and traversal will **not** follow the link. + /// + /// [`WalkEntry`]: crate::WalkEntry + #[default] + ReadFile, + /// Read the target of the symbolic link. + /// + /// This behavior reads the target of the symbolic link. The corresponding [`WalkEntry`] uses + /// the path of the link file and its metadata describes the target. If the target is a + /// directory, then traversal will follow the link and descend into the target. + /// + /// If a link is reentrant and forms a cycle, then an error will be emitted instead of a + /// [`WalkEntry`] and traversal will not follow the link. + /// + /// [`WalkEntry`]: crate::WalkEntry + ReadTarget, +} + +/// Configuration for matching [`Glob`]s against directory trees. +/// +/// Determines the behavior of the traversal within a directory tree when using functions like +/// [`Glob::walk`]. `WalkBehavior` can be constructed via conversions from types representing its +/// fields. APIs generally accept `impl Into<WalkBehavior>`, so these conversions can be used +/// implicitly. When constructed using such a conversion, `WalkBehavior` will use defaults for any +/// remaining fields. 
+/// +/// # Examples +/// +/// By default, symbolic links are interpreted as regular files and targets are ignored. To read +/// linked targets, use [`LinkBehavior::ReadTarget`]. +/// +/// ```rust +/// use wax::walk::LinkBehavior; +/// use wax::Glob; +/// +/// for entry in Glob::new("**") +/// .unwrap() +/// .walk_with_behavior(".", LinkBehavior::ReadTarget) +/// { +/// let entry = entry.unwrap(); +/// // ... +/// } +/// ``` +/// +/// [`Glob`]: crate::Glob +/// [`Glob::walk`]: crate::Glob::walk +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct WalkBehavior { + // TODO: Consider using a dedicated type for this field. Using primitive types does not + // interact well with conversions used in `walk` APIs. For example, if another `usize` + // field is introduced, then the conversions become ambiguous and confusing. + /// Maximum depth. + /// + /// Determines the maximum depth to which a directory tree will be traversed relative to [the + /// root][`Walk::root`]. A depth of zero corresponds to the root and so using such a depth will + /// yield at most one entry for the root. + /// + /// The default value is [`usize::MAX`]. + /// + /// [`usize::MAX`]: usize::MAX + /// [`Walk::root`]: crate::Walk::root + pub depth: usize, + /// Interpretation of symbolic links. + /// + /// Determines how symbolic links are interpreted when traversing a directory tree. See + /// [`LinkBehavior`]. + /// + /// The default value is [`LinkBehavior::ReadFile`]. + /// + /// [`LinkBehavior`]: crate::LinkBehavior + /// [`LinkBehavior::ReadFile`]: crate::LinkBehavior::ReadFile + pub link: LinkBehavior, +} + +/// Constructs a `WalkBehavior` using the following defaults: +/// +/// | Field | Description | Value | +/// |-----------|-----------------------------------|----------------------------| +/// | [`depth`] | Maximum depth. | [`usize::MAX`] | +/// | [`link`] | Interpretation of symbolic links. 
| [`LinkBehavior::ReadFile`] | +/// +/// [`depth`]: crate::WalkBehavior::depth +/// [`link`]: crate::WalkBehavior::link +/// [`LinkBehavior::ReadFile`]: crate::LinkBehavior::ReadFile +/// [`usize::MAX`]: usize::MAX +impl Default for WalkBehavior { + fn default() -> Self { + WalkBehavior { + depth: usize::MAX, + link: LinkBehavior::default(), + } + } +} + +impl From<()> for WalkBehavior { + fn from(_: ()) -> Self { + Default::default() + } +} + +impl From for WalkBehavior { + fn from(link: LinkBehavior) -> Self { + WalkBehavior { + link, + ..Default::default() + } + } +} + +impl From for WalkBehavior { + fn from(depth: usize) -> Self { + WalkBehavior { + depth, + ..Default::default() + } + } +} + +#[derive(Clone, Debug)] +pub struct WalkEntry { + entry: DirEntry, + // TODO: Implement this as a generalized wrapper around entry types (probably by introducing an + // `Entry` trait). This allows any entry type to manage a prefix, allowing prefix + // management within an arbitrary combinator. + stripped: Option, +} + +impl WalkEntry { + pub fn strip_path_prefix(self, prefix: impl AsRef) -> Option { + self.path() + .strip_prefix(prefix) + .ok() + .map(PathBuf::from) + .map(|stripped| { + let WalkEntry { entry, .. } = self; + WalkEntry { + entry, + stripped: Some(stripped), + } + }) + } + + pub fn unstrip_path_prefix(self) -> Self { + let WalkEntry { entry, .. 
} = self; + WalkEntry { + entry, + stripped: None, + } + } + + pub fn into_path(self) -> PathBuf { + self.entry.into_path() + } + + pub fn path(&self) -> &Path { + self.entry.path() + } + + pub fn metadata(&self) -> Result { + self.entry.metadata().map_err(From::from) + } + + pub fn file_type(&self) -> FileType { + self.entry.file_type() + } + + pub fn depth(&self) -> usize { + self.entry.depth() + } + + pub fn stripped_or_base_path(&self) -> &Path { + self.stripped_path().unwrap_or_else(|| self.path()) + } + + pub fn stripped_path(&self) -> Option<&Path> { + self.stripped.as_ref().map(AsRef::as_ref) + } + + pub fn has_stripped_path(&self) -> bool { + self.stripped.is_some() + } +} + +impl AsRef for WalkEntry { + fn as_ref(&self) -> &WalkEntry { + self + } +} + +impl From for WalkEntry { + fn from(entry: DirEntry) -> Self { + WalkEntry { + entry, + stripped: None, + } + } +} + +#[derive(Debug)] +pub struct WalkTree { + is_dir: bool, + input: walkdir::IntoIter, +} + +impl WalkTree { + pub fn new(root: impl AsRef) -> Self { + Self::with_behavior(root, WalkBehavior::default()) + } + + pub fn with_behavior(root: impl AsRef, behavior: impl Into) -> Self { + let WalkBehavior { depth, link } = behavior.into(); + WalkDir::new(root) + .follow_links(match link { + LinkBehavior::ReadFile => false, + LinkBehavior::ReadTarget => true, + }) + .max_depth(depth) + .into() + } +} + +impl From for WalkTree { + fn from(walkdir: WalkDir) -> Self { + WalkTree { + is_dir: false, + input: walkdir.into_iter(), + } + } +} + +impl Iterator for WalkTree { + type Item = Result; + + fn next(&mut self) -> Option { + let (is_dir, next) = match self.input.next() { + Some(result) => match result { + Ok(entry) => (entry.file_type().is_dir(), Some(Ok(entry.into()))), + Err(error) => (false, Some(Err(error.into()))), + }, + _ => (false, None), + }; + self.is_dir = is_dir; + next + } +} + +impl SeparatingFilterInput for WalkTree { + type Feed = (Result, TreeResidue); +} + +impl SkipTree for WalkTree { 
+ fn skip_tree(&mut self) { + // `IntoIter::skip_current_dir` discards the least recently yielded directory, but + // `skip_tree` must act upon the most recently yielded node regardless of its topology + // (leaf vs. branch). + if self.is_dir { + self.input.skip_current_dir(); + } + } +} + +/// An [`Iterator`] over [`WalkEntry`]s that can filter directory trees. +/// +/// A `FileIterator` is a `TreeIterator` that yields [`WalkEntry`]s. This trait is implemented by +/// [`Walk`] and adaptors like [`FilterTree`]. A `TreeIterator` is an iterator that reads its items +/// from a tree and therefore can meaningfully filter not only items but their corresponding +/// sub-trees to avoid unnecessary work. To that end, this trait provides the `filter_tree` +/// function, which allows directory trees to be discarded (not read from the file system) when +/// matching [`Glob`]s against directory trees. +/// +/// [`filter_tree`]: crate::FileIterator::filter_tree +/// [`Glob`]: crate::Glob +/// [`Iterator`]: std::iter::Iterator +/// [`WalkEntry`]: crate::WalkEntry +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +pub trait FileIterator: + Iterator> + TreeIterator> +{ + type Entry; + + /// Filters [`WalkEntry`]s and controls the traversal of directory trees. + /// + /// This function creates an adaptor that filters [`WalkEntry`]s and furthermore specifies how + /// iteration proceeds to traverse directory trees. The adaptor accepts a function that, when + /// discarding a [`WalkEntry`], yields a [`TreeResidue`]. **If the entry refers to a directory + /// and [`TreeResidue::Tree`] is returned by the function, then iteration will not descend + /// into that directory and the tree will not be read from the file system.** Therefore, this + /// adaptor should be preferred over functions like [`Iterator::filter`] when discarded + /// directories do not need to be read. 
+ /// + /// Errors are not filtered, so if an error occurs reading a file at a path that would have + /// been discarded, then that error is still yielded by the iterator. + /// + /// # Examples + /// + /// The [`FilterTree`] adaptor can be used to apply additional custom filtering that avoids + /// unnecessary directory reads. The following example filters out hidden files on Unix and + /// Windows. On Unix, hidden files are filtered out nominally via [`not`]. On Windows, + /// `filter_tree` is used to detect the [hidden attribute][attributes]. In both cases, the + /// adaptor does not read conventionally hidden directory trees. + /// + /// ```rust,no_run + /// use wax::walk::FileIterator; + /// #[cfg(windows)] + /// use wax::walk::TreeResidue; + /// use wax::Glob; + /// + /// let glob = Glob::new("**/*.(?i){jpg,jpeg}").unwrap(); + /// let walk = glob.walk("./Pictures"); + /// // Filter out nominally hidden files on Unix. Like `filter_tree`, `not` does not perform + /// // unnecessary reads of directory trees. + /// #[cfg(unix)] + /// let walk = walk.not(["**/.*/**"]).unwrap(); + /// // Filter out files with the hidden attribute on Windows. + /// #[cfg(windows)] + /// let walk = walk.filter_tree(|entry| { + /// use std::os::windows::fs::MetadataExt as _; + /// + /// const ATTRIBUTE_HIDDEN: u32 = 0x2; + /// + /// let attributes = entry.metadata().unwrap().file_attributes(); + /// if (attributes & ATTRIBUTE_HIDDEN) == ATTRIBUTE_HIDDEN { + /// // Do not read hidden directory trees. 
+ /// Some(TreeResidue::Tree(())) + /// } + /// else { + /// None + /// } + /// }); + /// for entry in walk { + /// let entry = entry.unwrap(); + /// println!("JPEG: {:?}", entry.path()); + /// } + /// ``` + /// + /// [`FilterTree`]: crate::FilterTree + /// [`Iterator`]: std::iter::Iterator + /// [`Iterator::filter`]: std::iter::Iterator::filter + /// [`not`]: crate::Walk::not + /// [`Walk`]: crate::Walk + /// [`WalkEntry`]: crate::WalkEntry + /// + /// [attributes]: https://docs.microsoft.com/en-us/windows/win32/fileio/file-attribute-constants + fn filter_tree(self, f: F) -> FilterTree + where + WalkEntry: From, + Self: Sized, + Self::Entry: AsRef, + F: FnMut(&WalkEntry) -> Option>, + { + FilterTree { input: self, f } + } + + /// Filters [`WalkEntry`]s against negated glob expressions. + /// + /// This function creates an adaptor that discards [`WalkEntry`]s that match any of the given + /// glob expressions. This allows for broad negations while matching a [`Glob`] against a + /// directory tree that cannot be achieved using a single glob expression alone. + /// + /// The adaptor is constructed via [`FilterTree`] and [`WalkNegation`] and therefore does not + /// read directory trees from the file system when a directory matches an [exhaustive glob + /// expression][`Pattern::is_exhaustive`] such as `**/private/**` or `hidden/</>*`. **This + /// function should be preferred when filtering [`WalkEntry`]s against [`Glob`]s, since this + /// avoids potentially large and unnecessary reads**. + /// + /// # Errors + /// + /// Returns an error if any of the inputs fail to build. If the inputs are a compiled + /// [`Pattern`] type such as [`Glob`], then this only occurs if the compiled program is too + /// large. + /// + /// # Examples + /// + /// Because glob expressions do not support general negations, it is sometimes impossible to + /// express patterns that deny particular text. In such cases, `not` can be used to apply + /// additional patterns as a filter. 
+ /// + /// ```rust,no_run + /// use wax::walk::FileIterator; + /// use wax::Glob; + /// + /// // Find image files, but not if they are beneath a directory with a name that suggests that + /// // they are private. + /// let glob = Glob::new("**/*.(?i){jpg,jpeg,png}").unwrap(); + /// for entry in glob.walk(".").not(["**/(?i)<.:0,1>private/**"]).unwrap() { + /// let entry = entry.unwrap(); + /// // ... + /// } + /// ``` + /// + /// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree + /// [`Glob`]: crate::Glob + /// [`Iterator::filter`]: std::iter::Iterator::filter + /// [`Pattern`]: crate::Pattern + /// [`Pattern::is_exhaustive`]: crate::Pattern::is_exhaustive + /// [`WalkEntry`]: crate::WalkEntry + /// [`WalkNegation`]: crate::WalkNegation + fn not<'t, I>(self, patterns: I) -> Result, BuildError> + where + WalkEntry: From, + Self: Sized, + Self::Entry: AsRef, + I: IntoIterator, + I::Item: Combine<'t>, + { + WalkNegation::any(patterns).map(|negation| Not { + input: self, + negation, + }) + } +} + +impl FileIterator for I +where + I: Iterator> + TreeIterator>, +{ + type Entry = T; +} + +// TODO: Implement this using combinators provided by the `filter` module and RPITIT once it lands +// in stable Rust. Remove any use of `WalkCancellation::unchecked`. +/// Iterator adaptor that filters [`WalkEntry`]s and controls the traversal of directory trees. +/// +/// This adaptor is returned by [`FileIterator::filter_tree`] and in addition to filtering +/// [`WalkEntry`]s also determines how `TreeIterator`s traverse directory trees. If discarded +/// directories do not need to be read from the file system, then **this adaptor should be +/// preferred over functions like [`Iterator::filter`], because it can avoid potentially large and +/// unnecessary reads.** +/// +/// `FilterTree` is a `TreeIterator` and supports [`FileIterator::filter_tree`] so `filter_tree` +/// may be chained. 
+/// +/// [`FileIterator::filter_tree`]: crate::FileIterator::filter_tree +/// [`WalkEntry`]: crate::WalkEntry +#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Clone, Debug)] +pub struct FilterTree { + input: I, + f: F, +} + +impl SeparatingFilter for FilterTree +where + WalkEntry: From, + T: 'static + AsRef, + I: FileIterator, + F: FnMut(&WalkEntry) -> Option>, +{ + type Feed = I::Feed; + + fn feed(&mut self) -> Option> { + self.input + .feed() + .map(|separation| match separation.transpose_filtrate() { + Ok(separation) => separation + .filter_tree_by_substituent( + WalkCancellation::unchecked(&mut self.input), + &mut self.f, + ) + .map_filtrate(Ok), + Err(error) => error.map(Err).into(), + }) + } +} + +impl Iterator for FilterTree +where + WalkEntry: From, + T: 'static + AsRef, + I: FileIterator, + F: FnMut(&WalkEntry) -> Option>, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + filter::filtrate(self) + } +} + +impl SkipTree for FilterTree +where + I: SkipTree, +{ + fn skip_tree(&mut self) { + self.input.skip_tree() + } +} + +// TODO: Implement this using combinators provided by the `filter` module and RPITIT once it lands +// in stable Rust. Remove any use of `WalkCancellation::unchecked`. 
+#[cfg_attr(docsrs, doc(cfg(feature = "walk")))] +#[derive(Clone, Debug)] +pub struct Not { + input: I, + negation: WalkNegation, +} + +impl SeparatingFilter for Not +where + WalkEntry: From, + T: 'static + AsRef, + I: FileIterator, +{ + type Feed = I::Feed; + + fn feed(&mut self) -> Option> { + self.input + .feed() + .map(|separation| match separation.transpose_filtrate() { + Ok(separation) => separation + .filter_tree_by_substituent( + WalkCancellation::unchecked(&mut self.input), + |substituent| self.negation.residue(substituent), + ) + .map_filtrate(Ok), + Err(error) => error.map(Err).into(), + }) + } +} + +impl Iterator for Not +where + WalkEntry: From, + T: 'static + AsRef, + I: FileIterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + filter::filtrate(self) + } +} + +impl SkipTree for Not +where + I: SkipTree, +{ + fn skip_tree(&mut self) { + self.input.skip_tree() + } +} diff --git a/tests/walk.rs b/tests/walk.rs index 2ca9c54..9852010 100644 --- a/tests/walk.rs +++ b/tests/walk.rs @@ -5,7 +5,8 @@ use std::collections::HashSet; use std::path::PathBuf; use tempfile::{self, TempDir}; -use wax::{FileIterator, Glob, LinkBehavior, WalkBehavior}; +use wax::walk::{FileIterator, LinkBehavior, WalkBehavior}; +use wax::Glob; // TODO: Rust's testing framework does not provide a mechanism for maintaining // shared state. This means that tests that write to the file system must