From 71f707d762d3a72cf2199d9fca6f2aef830144e6 Mon Sep 17 00:00:00 2001 From: Neil Hansen Date: Fri, 15 Nov 2024 15:56:54 -0800 Subject: [PATCH 1/2] tests pass --- src/regex/compile.rs | 12 ++++++- src/regex/dfa.rs | 29 +++++++++++++++-- src/regex/error.rs | 12 +------ src/regex/mod.rs | 74 +++++++++++++++++++++++++++++++++++++------- 4 files changed, 100 insertions(+), 27 deletions(-) diff --git a/src/regex/compile.rs b/src/regex/compile.rs index 7267a7d4..fcfaa3cf 100644 --- a/src/regex/compile.rs +++ b/src/regex/compile.rs @@ -96,7 +96,17 @@ impl Compiler { self.set_split(split, j2, j3); } } - HirKind::Look(_) => return Err(Error::NoEmpty), + HirKind::Look(look) => { + match look { + regex_syntax::hir::Look::Start => { + self.push(Inst::StartText); + } + regex_syntax::hir::Look::End => { + self.push(Inst::EndText); + } + _ => return Err(Error::NoWordBoundary), + } + } } self.check_size() } diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs index 3af59edf..6d52c119 100644 --- a/src/regex/dfa.rs +++ b/src/regex/dfa.rs @@ -21,6 +21,8 @@ struct State { insts: Vec, next: [Option; 256], is_match: bool, + at_start: bool, + at_end: bool, } impl DfaBuilder { @@ -69,7 +71,7 @@ impl DfaBuilder { for &ip in &self.dfa.states[state].insts { cur.add(ip); } - self.dfa.run(cur, next, byte); + self.dfa.run(cur, next, Some(byte), false, false); let next_state = self.cached_state(next); self.dfa.states[state].next[byte as usize] = next_state; next_state @@ -92,6 +94,7 @@ impl DfaBuilder { is_match = true; insts.push(ip); } + StartText | EndText => insts.push(ip), } } if insts.is_empty() { @@ -104,6 +107,8 @@ impl DfaBuilder { insts, next: [None; 256], is_match, + at_start: false, + at_end: false, }); *v.insert(self.dfa.states.len() - 1) } @@ -134,10 +139,16 @@ impl Dfa { self.add(set, ip1); self.add(set, ip2); } + StartText => { + self.add(set, ip + 1); + } + EndText => { + self.add(set, ip + 1); + } } } - fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool { + fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: Option, at_start: bool, at_end: bool) -> bool { use super::Inst::*; to.clear(); let mut is_match = false; @@ -147,7 +158,19 @@ impl Dfa { Jump(_) | Split(_, _) => {} Match => is_match = true, Range(s, e) => { - if s <= byte && byte <= e { + if let Some(b) = byte { + if s <= b && b <= e { + self.add(to, ip + 1); + } + } + } + StartText => { + if at_start { + self.add(to, ip + 1); + } + } + EndText => { + if at_end { self.add(to, ip + 1); } } diff --git a/src/regex/error.rs b/src/regex/error.rs index 561c554c..cf1826f3 100644 --- a/src/regex/error.rs +++ b/src/regex/error.rs @@ -28,11 +28,6 @@ pub enum Error { /// /// This restriction may be lifted in the future. NoWordBoundary, - /// Empty or "zero width assertions" such as `^` or `$` are currently - /// not allowed. - /// - /// This restriction may be lifted in the future. - NoEmpty, /// Byte literals such as `(?-u:\xff)` are not allowed. /// /// This restriction may be lifted in the future. @@ -49,7 +44,7 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Error::*; - match *self { + match self { Syntax(ref err) => err.fmt(f), CompiledTooBig(size_limit) => write!( f, @@ -71,11 +66,6 @@ impl fmt::Display for Error { "Word boundary operators are not \ allowed." ), - NoEmpty => write!( - f, - "Empty match operators are not allowed \ - (hopefully temporary)." - ), NoBytes => write!(f, "Byte literals are not allowed."), } } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 50f25185..84ec5f10 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -29,9 +29,6 @@ pub use self::error::Error; /// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in /// a deterministic finite automaton, but not impossible. As such, these /// may be allowed some day. -/// 3. Other zero width assertions like `^` and `$`. These are easier to -/// support than word boundaries, but are still tricky and usually aren't -/// as useful when searching dictionaries. /// /// Otherwise, the [full syntax of the `regex` /// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax) @@ -58,12 +55,58 @@ pub struct Regex { dfa: dfa::Dfa, } +#[cfg(test)] +mod tests { + use super::*; + + fn run_regex(re: &str, input: &str) -> bool { + let regex = Regex::new(re).unwrap(); + let mut state = regex.start(); + for b in input.as_bytes() { + state = regex.accept(&state, *b); + if !regex.can_match(&state) { + return false; + } + } + regex.is_match(&state) + } + + #[test] + fn test_start_text() { + assert!(run_regex(r"^abc", "abc")); + assert!(run_regex(r"^abc.*", "abcdef")); + assert!(!run_regex(r"^abc", "defabc")); + } + + #[test] + fn test_end_text() { + assert!(run_regex(r"abc$", "abc")); + assert!(run_regex(r".*abc$", "defabc")); + assert!(!run_regex(r"abc$", "abcdef")); + } + + #[test] + fn test_start_and_end_text() { + assert!(run_regex(r"^abc$", "abc")); + assert!(!run_regex(r"^abc$", "defabc")); + assert!(!run_regex(r"^abc$", "abcdef")); + } + + #[test] + fn test_empty_string() { + assert!(run_regex(r"^$", "")); + assert!(!run_regex(r"^$", "a")); + } +} + #[derive(Eq, PartialEq)] pub enum Inst { Match, Jump(usize), Split(usize, usize), Range(u8, u8), + StartText, + EndText, } impl Regex { @@ -93,26 +136,31 @@ impl Regex { } impl Automaton for Regex { - type State = Option; + type State = (Option, usize); // (state index, position) #[inline] - fn start(&self) -> Option { - Some(0) + fn start(&self) -> Self::State { + (Some(0), 0) } #[inline] - fn is_match(&self, state: &Option) -> bool { - state.map(|state| self.dfa.is_match(state)).unwrap_or(false) + fn is_match(&self, state: &Self::State) -> bool { + state + .0 + .map(|state| self.dfa.is_match(state)) + .unwrap_or(false) } #[inline] - fn can_match(&self, state: &Option) -> bool { - state.is_some() + fn can_match(&self, state: &Self::State) -> bool { + state.0.is_some() } #[inline] - fn accept(&self, state: &Option, byte: u8) -> Option { - state.and_then(|state| self.dfa.accept(state, byte)) + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + let (si, pos) = *state; + let si = si.and_then(|si| self.dfa.accept(si, byte)); + (si, pos + 1) } } @@ -131,6 +179,8 @@ impl fmt::Debug for Inst { Jump(ip) => write!(f, "JUMP {}", ip), Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2), Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e), + StartText => write!(f, "START"), + EndText => write!(f, "END"), } } } From eb590eb5c128b3865203e791dce3b2aa411808cb Mon Sep 17 00:00:00 2001 From: Neil Hansen Date: Tue, 19 Nov 2024 15:24:22 -0800 Subject: [PATCH 2/2] remove extra fields --- src/regex/dfa.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs index 6d52c119..89592140 100644 --- a/src/regex/dfa.rs +++ b/src/regex/dfa.rs @@ -21,8 +21,6 @@ struct State { insts: Vec, next: [Option; 256], is_match: bool, - at_start: bool, - at_end: bool, } impl DfaBuilder { @@ -107,8 +105,6 @@ impl DfaBuilder { insts, next: [None; 256], is_match, - at_start: false, - at_end: false, }); *v.insert(self.dfa.states.len() - 1) } @@ -148,7 +144,14 @@ impl Dfa { } } - fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: Option, at_start: bool, at_end: bool) -> bool { + fn run( + &self, + from: &SparseSet, + to: &mut SparseSet, + byte: Option, + at_start: bool, + at_end: bool, + ) -> bool { use super::Inst::*; to.clear(); let mut is_match = false;