From 5872defc60bbc2174cd88d76eeab9b394b1e60e9 Mon Sep 17 00:00:00 2001 From: Sean Olson Date: Wed, 27 Mar 2024 12:04:50 -0700 Subject: [PATCH] [wip] Compile globs via HIR rather than text. This change migrates from `regex` to its intermediate `regex-syntax` and `regex-automata` crates. This greatly improves the flexibility in encoding globs and matched text and removes the need for obtuse textual encoding before compiling automata. --- Cargo.toml | 10 +- src/capture.rs | 134 +++++--------------------- src/encode.rs | 15 ++- src/hir.rs | 246 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 58 ++++++++--- src/token/mod.rs | 6 ++ src/walk/glob.rs | 14 +-- 7 files changed, 344 insertions(+), 139 deletions(-) create mode 100644 src/hir.rs diff --git a/Cargo.toml b/Cargo.toml index 91c57da..d8ec0d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,6 @@ miette = [ walk = ["dep:walkdir"] [dependencies] -const_format = "^0.2.0" itertools = "^0.11.0" nom = "^7.0.0" pori = "=0.0.0" @@ -44,15 +43,20 @@ version = "^5.10.0" default-features = false optional = true -[dependencies.regex] -version = "^1.9.0" +[dependencies.regex-automata] +version = "^0.4.6" default-features = false features = [ + "meta", "perf", "std", "unicode-case" ] +[dependencies.regex-syntax] +version = "^0.8.3" +default-features = false + [dependencies.tardar] version = "^0.1.0" optional = true diff --git a/src/capture.rs b/src/capture.rs index a4e4f36..e242806 100644 --- a/src/capture.rs +++ b/src/capture.rs @@ -1,81 +1,28 @@ -use regex::Captures as BorrowedText; +use regex_automata::meta::Regex; +use regex_automata::util::captures::Captures; +use std::borrow::Cow; use std::str; use crate::CandidatePath; -#[derive(Clone, Debug)] -struct OwnedText { - matched: String, - ranges: Vec>, +pub trait RegexExt { + fn matched<'t>(&self, text: impl Into>) -> Option>; } -impl OwnedText { - pub fn get(&self, index: usize) -> Option<&str> { - if index == 0 { - Some(self.matched.as_ref()) +impl RegexExt for Regex { + fn matched<'t>(&self, text: impl Into>) -> Option> { + let text = text.into(); + let mut captures = self.create_captures(); + self.captures(text.as_ref(), &mut captures); + if captures.is_match() { + Some(MatchedText { text, captures }) } else { - self.ranges - .get(index - 1) - .and_then(|range| range.map(|range| &self.matched[range.0..range.1])) + None } } } -impl<'t> From> for OwnedText { - fn from(captures: BorrowedText<'t>) -> Self { - From::from(&captures) - } -} - -impl<'m, 't> From<&'m BorrowedText<'t>> for OwnedText { - fn from(captures: &'m BorrowedText<'t>) -> Self { - let matched = captures.get(0).unwrap().as_str().into(); - let ranges = captures - .iter() - .skip(1) - .map(|capture| capture.map(|capture| (capture.start(), capture.end()))) - .collect(); - OwnedText { matched, ranges } - } -} - -#[derive(Debug)] -enum MaybeOwnedText<'t> { - Borrowed(BorrowedText<'t>), - Owned(OwnedText), -} - -impl<'t> MaybeOwnedText<'t> { - fn into_owned(self) -> MaybeOwnedText<'static> { - match self { - MaybeOwnedText::Borrowed(borrowed) => OwnedText::from(borrowed).into(), - MaybeOwnedText::Owned(owned) => owned.into(), - } - } - - // This conversion may appear to operate in place. - #[must_use] - fn to_owned(&self) -> MaybeOwnedText<'static> { - match self { - MaybeOwnedText::Borrowed(ref borrowed) => OwnedText::from(borrowed).into(), - MaybeOwnedText::Owned(ref owned) => owned.clone().into(), - } - } -} - -impl<'t> From> for MaybeOwnedText<'t> { - fn from(captures: BorrowedText<'t>) -> Self { - MaybeOwnedText::Borrowed(captures) - } -} - -impl From for MaybeOwnedText<'static> { - fn from(captures: OwnedText) -> Self { - MaybeOwnedText::Owned(captures) - } -} - /// Text that has been matched by a [`Program`] and its captures. /// /// To match a [`Glob`] or other [`Program`] against a [`CandidatePath`] and get the matched text, @@ -107,33 +54,19 @@ impl From for MaybeOwnedText<'static> { /// [`Glob`]: crate::Glob /// [`Program`]: crate::Program /// [`Program::matched`]: crate::Program::matched -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct MatchedText<'t> { - inner: MaybeOwnedText<'t>, + text: Cow<'t, str>, + captures: Captures, } impl<'t> MatchedText<'t> { /// Clones any borrowed data into an owning instance. pub fn into_owned(self) -> MatchedText<'static> { - let MatchedText { inner } = self; - MatchedText { - inner: inner.into_owned(), - } - } - - /// Clones any borrowed data to an owning instance. - /// - /// This function is similar to [`into_owned`], but does not consume its receiver. Due to a - /// technical limitation, `MatchedText` cannot properly implement [`Clone`], so this function - /// is provided as a stop gap that allows a distinct instance to be created that owns its data. - /// - /// [`Clone`]: std::clone::Clone - /// [`into_owned`]: crate::MatchedText::into_owned - // This conversion may appear to operate in place. - #[must_use] - pub fn to_owned(&self) -> MatchedText<'static> { + let MatchedText { text, captures } = self; MatchedText { - inner: self.inner.to_owned(), + text: text.into_owned().into(), + captures, } } @@ -162,32 +95,15 @@ impl<'t> MatchedText<'t> { /// /// [`Program`]: crate::Program pub fn get(&self, index: usize) -> Option<&str> { - match self.inner { - MaybeOwnedText::Borrowed(ref captures) => { - captures.get(index).map(|capture| capture.as_str()) - }, - MaybeOwnedText::Owned(ref captures) => captures.get(index), - } + self.captures.get_group(index).map(|span| { + self.text + .as_ref() + .get(span.start..span.end) + .expect("match span not in text") + }) } pub fn to_candidate_path(&self) -> CandidatePath { CandidatePath::from(self.complete()) } } - -// TODO: This probably shouldn't be part of the public API. -impl<'t> From> for MatchedText<'t> { - fn from(captures: BorrowedText<'t>) -> Self { - MatchedText { - inner: captures.into(), - } - } -} - -impl From for MatchedText<'static> { - fn from(captures: OwnedText) -> Self { - MatchedText { - inner: captures.into(), - } - } -} diff --git a/src/encode.rs b/src/encode.rs index 7f9c29e..199462f 100644 --- a/src/encode.rs +++ b/src/encode.rs @@ -2,13 +2,18 @@ use const_format::formatcp; use itertools::{Itertools as _, Position}; #[cfg(feature = "miette")] use miette::Diagnostic; -use regex::{Error as RegexError, Regex}; +use regex_automata::meta::BuildError; +use regex_syntax::hir::Hir; use std::borrow::{Borrow, Cow}; #[cfg(feature = "miette")] use std::fmt::Display; use thiserror::Error; -use crate::token::{ConcatenationTree, Token, TokenTopology}; +// TODO: Replace this file with `hir.rs`. + +pub use regex_automata::meta::Regex; + +use crate::token::{self, Fold, ConcatenationTree, Token, TokenTopology}; /// A regular expression that never matches. /// @@ -93,13 +98,13 @@ trait Escaped { impl Escaped for char { fn escaped(&self) -> String { - regex::escape(&self.to_string()) + regex_syntax::escape(&self.to_string()) } } impl Escaped for str { fn escaped(&self) -> String { - regex::escape(self) + regex_syntax::escape(self) } } @@ -128,7 +133,7 @@ impl Grouping { } pub fn case_folded_eq(left: &str, right: &str) -> bool { - let regex = Regex::new(&format!("(?i){}", regex::escape(left))) + let regex = Regex::new(&format!("(?i){}", regex_syntax::escape(left))) .expect("failed to compile literal regular expression"); if let Some(matched) = regex.find(right) { matched.start() == 0 && matched.end() == right.len() diff --git a/src/hir.rs b/src/hir.rs new file mode 100644 index 0000000..0f1fa3a --- /dev/null +++ b/src/hir.rs @@ -0,0 +1,246 @@ +#[cfg(feature = "miette")] +use miette::Diagnostic; +use regex_syntax::hir::{self, Hir}; +use std::borrow::Borrow; +#[cfg(feature = "miette")] +use std::fmt::Display; +use thiserror::Error; + +pub use regex_automata::meta::Regex; + +use crate::token::walk::{Fold, Forward}; +use crate::token::{self, Archetype, BranchKind, ConcatenationTree, LeafKind}; + +trait IntoHir { + fn into_hir(self) -> Hir; +} + +impl IntoHir for hir::ClassUnicode { + fn into_hir(self) -> Hir { + Hir::class(hir::Class::Unicode(self)) + } +} + +/// Describes errors that occur when compiling a glob expression. +/// +/// **This error only occurs when the size of the compiled program is too large.** All other +/// compilation errors are considered internal bugs and will panic. +#[derive(Clone, Debug, Error)] +#[error("failed to compile glob: {kind}")] +pub struct CompileError { + kind: CompileErrorKind, +} + +#[derive(Clone, Copy, Debug, Error)] +#[non_exhaustive] +enum CompileErrorKind { + #[error("oversized program")] + OversizedProgram, +} + +#[cfg(feature = "miette")] +#[cfg_attr(docsrs, doc(cfg(feature = "miette")))] +impl Diagnostic for CompileError { + fn code<'a>(&'a self) -> Option> { + Some(Box::new(String::from(match self.kind { + CompileErrorKind::OversizedProgram => "wax::glob::oversized_program", + }))) + } +} + +// TODO: The implementation of this function depends on platform/OS. +fn separator() -> hir::ClassUnicode { + hir::ClassUnicode::new([hir::ClassUnicodeRange::new('/', '/')]) +} + +fn not_separator() -> hir::ClassUnicode { + let mut hir = separator(); + hir.negate(); + hir +} + +fn case_folded_literal(text: impl AsRef) -> Hir { + Hir::concat( + text.as_ref() + .chars() + .map(|point| hir::ClassUnicode::new([hir::ClassUnicodeRange::new(point, point)])) + .map(|mut hir| { + let _ = hir.try_case_fold_simple(); + hir + }) + .map(hir::Class::Unicode) + .map(Hir::class) + .collect(), + ) +} + +pub fn case_folded_eq(lhs: impl AsRef, rhs: impl AsRef) -> bool { + let lhs = self::case_folded_literal(lhs); + let rhs = rhs.as_ref(); + let regex = Regex::builder() + .build_from_hir(&lhs) + .expect("failed to compile case folding regular expression"); + regex.find(rhs).map_or(false, |matched| { + matched.start() == 0 && matched.end() == rhs.len() + }) +} + +pub fn compile<'t, T>(tree: impl Borrow) -> Result +where + T: ConcatenationTree<'t>, +{ + #[derive(Debug, Default)] + struct Compile; + + impl<'t, A> Fold<'t, A> for Compile { + type Sequencer = Forward; + type Term = Hir; + + fn sequencer() -> Self::Sequencer { + Forward + } + + fn fold( + &mut self, + branch: &BranchKind<'t, A>, + terms: Vec, + ) -> Option { + use BranchKind::{Alternation, Concatenation, Repetition}; + + Some(match branch { + Alternation(_) => Hir::alternation(terms), + Concatenation(_) => Hir::concat(terms), + Repetition(ref repetition) => { + let (min, max) = repetition.bound_specification(); + Hir::repetition(hir::Repetition { + min: u32::try_from(min).unwrap_or(u32::MAX), + max: max.map(u32::try_from).transpose().unwrap_or(Some(u32::MAX)), + greedy: true, + sub: Box::new(Hir::concat(terms)), + }) + }, + }) + } + + fn finalize(&mut self, _branch: &BranchKind<'t, A>, term: Self::Term) -> Self::Term { + term + } + + fn term(&mut self, leaf: &LeafKind<'t>) -> Self::Term { + use token::Wildcard::{One, Tree, ZeroOrMore}; + use Archetype::{Character, Range}; + use LeafKind::{Class, Literal, Separator, Wildcard}; + + match leaf { + Class(ref class) => { + let is_negated = class.is_negated(); + let mut class = + hir::ClassUnicode::new(class.archetypes().iter().map(|archetype| { + let (start, end) = match archetype { + Character(ref point) => (*point, *point), + Range(ref start, ref end) => (*start, *end), + }; + hir::ClassUnicodeRange::new(start, end) + })); + if is_negated { + class.union(&self::separator()); + class.negate(); + } + else { + class.difference(&self::separator()); + } + class.into_hir() + }, + Literal(ref literal) => { + if literal.is_case_insensitive() { + self::case_folded_literal(literal.text()) + } + else { + Hir::literal(literal.text().as_bytes()) + } + }, + // TODO: Separators should probably also match the end of text when they are at the + // end of a glob expression. This may not be possible in a fold with simple + // terms though, since that positional information isn't available until + // reaching the root of the token tree. + Separator(_) => self::separator().into_hir(), + Wildcard(ref wildcard) => match wildcard { + One => Hir::class(hir::Class::Unicode(self::not_separator())), + Tree { has_root } => Hir::alternation(vec![ + Hir::concat(vec![ + if *has_root { + self::separator().into_hir() + } + else { + Hir::empty() + }, + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: true, + sub: Box::new(Hir::dot(hir::Dot::AnyChar)), + }), + Hir::alternation(vec![ + self::separator().into_hir(), + Hir::look(hir::Look::End), + ]), + ]), + self::separator().into_hir(), + Hir::empty(), + ]), + // TODO: Zero or more wildcards should match **one** or more if they comprise + // the entirety of a component, such as in `a/*/b`. This may not be + // possible in a fold with simple terms though, since adjacency + // information isn't available until reaching the root of the token tree. + ZeroOrMore(ref evaluation) => Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: evaluation.is_eager(), + sub: Box::new(self::not_separator().into_hir()), + }), + }, + } + } + } + + let mut capture_group_index = 1u32; + let hir = Hir::concat( + Some(Hir::look(hir::Look::Start)) + .into_iter() + .chain(tree.borrow().concatenation().iter().map(|token| { + let hir = token.fold(Compile::default()).unwrap_or_else(Hir::empty); + if token.is_capturing() { + let index = capture_group_index; + capture_group_index = capture_group_index + .checked_add(1) + .expect("overflow determining capture group index"); + // TODO: Some tokens require different capture topology depending on their + // position in the concatenation. Position information is trivially + // available here, but a more complex term (likely a closure of some + // kind) is needed to integrate the capture HIR into its tree. This + // namely applies to tree wildcards, which should not always capture the + // entirety of the text that they match. + Hir::capture(hir::Capture { + index, + name: None, + sub: Box::new(hir), + }) + } + else { + hir + } + })) + .chain(Some(Hir::look(hir::Look::End))) + .collect(), + ); + // TODO: Remove this. + //eprintln!("TREE\n{:#?}", hir); + Regex::builder() + .build_from_hir(&hir) + .map_err(|error| match error.size_limit() { + Some(_) => CompileError { + kind: CompileErrorKind::OversizedProgram, + }, + _ => panic!("failed to compile glob"), + }) +} diff --git a/src/lib.rs b/src/lib.rs index ade2bcd..a7204c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,13 +32,16 @@ mod capture; mod diagnostics; -mod encode; +//mod encode; mod filter; +mod hir; pub mod query; mod rule; mod token; pub mod walk; +use hir as encode; + /// Re-exports of commonly used items. /// /// This module anonymously re-exports traits for matching [`Program`]s against file paths and @@ -66,7 +69,6 @@ pub mod prelude { #[cfg(feature = "miette")] use miette::Diagnostic; -use regex::Regex; use std::borrow::{Borrow, Cow}; use std::cmp::Ordering; use std::convert::Infallible; @@ -76,8 +78,9 @@ use std::path::{Path, PathBuf}; use std::str::{self, FromStr}; use thiserror::Error; +use crate::capture::RegexExt as _; use crate::diagnostics::LocatedError; -use crate::encode::CompileError; +use crate::encode::{CompileError, Regex}; use crate::query::{CapturingToken, DepthVariance, TextVariance, When}; use crate::rule::{Checked, RuleError}; use crate::token::{ @@ -745,7 +748,7 @@ impl<'t> Program<'t> for Glob<'t> { } fn matched<'p>(&self, path: &'p CandidatePath<'_>) -> Option> { - self.program.captures(path.as_ref()).map(From::from) + self.program.matched(path.as_ref()) } fn depth(&self) -> DepthVariance { @@ -804,7 +807,7 @@ impl<'t> Program<'t> for Any<'t> { } fn matched<'p>(&self, path: &'p CandidatePath<'_>) -> Option> { - self.program.captures(path.as_ref()).map(From::from) + self.program.matched(path.as_ref()) } fn depth(&self) -> DepthVariance { @@ -1089,7 +1092,8 @@ pub mod harness { pub fn assert_matched_is_none(matched: Option>) { assert!( matched.is_none(), - "matched text is `Some`, but expected `None`" + "matched text is `{:?}`, but expected `None`", + matched.as_ref().map(MatchedText::complete), ); } @@ -1623,6 +1627,32 @@ mod tests { harness::assert_any_is_err(patterns); } + // TODO: Remove this. + #[rstest] + #[case("a/b", "a/b", harness::assert_matched_has_text([ + (0, "a/b"), + ]))] + #[case("a?b", "axb", harness::assert_matched_has_text([ + (0, "axb"), + (1, "x"), + ]))] + #[case("a?b", "axyb", harness::assert_matched_is_none)] + #[case("a*b", "axyb", harness::assert_matched_has_text([ + (0, "axyb"), + (1, "xy"), + ]))] + #[case("a{c*,d*}{e*,f*}b", "acxfyb", harness::assert_matched_has_text([ + (0, "acxfyb"), + (1, "cx"), + (2, "fy"), + ]))] + fn match_glob_hir(#[case] expression: &str, #[case] path: &str, #[case] f: F) + where + F: FnOnce(Option>) -> T, + { + harness::assert_match_program_with(harness::assert_new_glob_is_ok(expression), path, f); + } + #[rstest] #[case("", harness::assert_matched_has_text([(0, "")]))] #[case("abc", harness::assert_matched_is_none)] @@ -1700,7 +1730,7 @@ mod tests { #[case("a/x/file.ext", harness::assert_matched_has_text([ (0, "a/x/file.ext"), (1, "x"), - (2, "file.ext"), + (2, "/file.ext"), ]))] #[case("a/y/file.ext", harness::assert_matched_has_text([(1, "y")]))] #[case("a/i/file.ext", harness::assert_matched_has_text([(1, "i")]))] @@ -1718,7 +1748,7 @@ mod tests { #[case("a/金/file.ext", harness::assert_matched_has_text([ (0, "a/金/file.ext"), (1, "金"), - (2, "file.ext"), + (2, "/file.ext"), ]))] #[case("a/銀/file.ext", harness::assert_matched_has_text([(1, "銀")]))] #[case("a/銅/file.ext", harness::assert_matched_is_none)] @@ -1955,17 +1985,19 @@ mod tests { #[rstest] #[case("/var/log/network.log", harness::assert_matched_has_text([ (0, "/var/log/network.log"), + // TODO: It seems compelling for `/**/` to capture `/` here, but is this general or ought + // this only apply to rooted tree wildcards at the beginning of an expression? + (1, "/"), (2, "var"), (3, "log/"), (4, "network"), ]))] #[case("/home/nobody/.var/network.log", harness::assert_matched_has_text([ (0, "/home/nobody/.var/network.log"), - // TODO: The match and capture behavior of `**` here seems not only to cross boundaries, - // but match only part of a component! Greedy or not, tree wildcards ought to operate - // exclusively on some number of **complete** components: - (1, "/home/nobody/."), - (2, "var"), + (1, "/home/nobody/"), + (2, ".var"), + // TODO: See capture 1 in the first test case. Should this capture `/` or `` (nothing)? + (3, "/"), (4, "network"), ]))] #[case("./var/cron.log", harness::assert_matched_is_none)] diff --git a/src/token/mod.rs b/src/token/mod.rs index 4d36505..3492c08 100644 --- a/src/token/mod.rs +++ b/src/token/mod.rs @@ -1176,6 +1176,12 @@ pub enum Evaluation { Lazy, } +impl Evaluation { + pub fn is_eager(&self) -> bool { + matches!(self, Evaluation::Eager) + } +} + #[derive(Clone, Debug, PartialEq)] pub struct Literal<'t> { text: Cow<'t, str>, diff --git a/src/walk/glob.rs b/src/walk/glob.rs index 6a5e18a..e762f23 100644 --- a/src/walk/glob.rs +++ b/src/walk/glob.rs @@ -1,11 +1,10 @@ use itertools::Itertools; -use regex::Regex; use std::borrow::Borrow; use std::fs::{FileType, Metadata}; use std::path::{Component, Path, PathBuf}; -use crate::capture::MatchedText; -use crate::encode::CompileError; +use crate::capture::{MatchedText, RegexExt as _}; +use crate::encode::{CompileError, Regex}; use crate::filter::{HierarchicalIterator, Separation}; use crate::token::{Token, TokenTree, Tokenized}; use crate::walk::{ @@ -313,8 +312,7 @@ impl GlobWalker { if let Some(matched) = self .program .complete - .captures(candidate.as_ref()) - .map(MatchedText::from) + .matched(candidate.as_ref()) .map(MatchedText::into_owned) { filtrate @@ -342,8 +340,7 @@ impl GlobWalker { return if let Some(matched) = self .program .complete - .captures(candidate.as_ref()) - .map(MatchedText::from) + .matched(candidate.as_ref()) .map(MatchedText::into_owned) { filtrate @@ -371,8 +368,7 @@ impl GlobWalker { if let Some(matched) = self .program .complete - .captures(candidate.as_ref()) - .map(MatchedText::from) + .matched(candidate.as_ref()) .map(MatchedText::into_owned) { return filtrate