diff --git a/src/regex/compile.rs b/src/regex/compile.rs
--- a/src/regex/compile.rs
+++ b/src/regex/compile.rs
@@ -96,7 +96,23 @@ impl Compiler {
                         self.set_split(split, j2, j3);
                     }
                 }
-            HirKind::Look(_) => return Err(Error::NoEmpty),
+            HirKind::Look(look) => {
+                use regex_syntax::hir::Look;
+                match look {
+                    Look::Start => {
+                        self.push(Inst::StartText);
+                    }
+                    Look::End => {
+                        self.push(Inst::EndText);
+                    }
+                    // Multi-line anchors are zero-width assertions, not word
+                    // boundaries, so they keep reporting `NoEmpty`.
+                    Look::StartLF | Look::EndLF | Look::StartCRLF | Look::EndCRLF => {
+                        return Err(Error::NoEmpty);
+                    }
+                    _ => return Err(Error::NoWordBoundary),
+                }
+            }
         }
         self.check_size()
     }
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
--- a/src/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -92,6 +92,7 @@ impl DfaBuilder {
                     is_match = true;
                     insts.push(ip);
                 }
+                StartText | EndText => insts.push(ip),
             }
         }
         if insts.is_empty() {
@@ -134,6 +135,14 @@ impl Dfa {
                 self.add(set, ip1);
                 self.add(set, ip2);
             }
+            // NOTE(review): anchors are followed unconditionally, i.e. they
+            // are treated as always satisfied and never reject a position.
+            // Presumably that is only right for a leading `^` or trailing
+            // `$`; an anchor elsewhere (e.g. `a^b`) would be ignored rather
+            // than made unmatchable. TODO: confirm or reject such patterns.
+            StartText | EndText => {
+                self.add(set, ip + 1);
+            }
         }
     }
 
@@ -147,7 +156,9 @@ impl Dfa {
                 Jump(_) | Split(_, _) => {}
+                // Anchors consume no input; `add` already inserted their successors.
+                StartText | EndText => {}
                 Match => is_match = true,
                 Range(s, e) => {
                     if s <= byte && byte <= e {
                         self.add(to, ip + 1);
                     }
                 }
diff --git a/src/regex/error.rs b/src/regex/error.rs
--- a/src/regex/error.rs
+++ b/src/regex/error.rs
@@ -28,11 +28,12 @@ pub enum Error {
     ///
     /// This restriction may be lifted in the future.
     NoWordBoundary,
-    /// Empty or "zero width assertions" such as `^` or `$` are currently
-    /// not allowed.
+    /// Empty or "zero width assertions" other than the text anchors `^` and
+    /// `$` (for example the multi-line anchors `(?m)^` and `(?m)$`) are
+    /// currently not allowed.
     ///
     /// This restriction may be lifted in the future.
     NoEmpty,
     /// Byte literals such as `(?-u:\xff)` are not allowed.
     ///
     /// This restriction may be lifted in the future.
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -29,9 +29,8 @@ pub use self::error::Error;
 /// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in
 ///    a deterministic finite automaton, but not impossible. As such, these
 ///    may be allowed some day.
-/// 3. Other zero width assertions like `^` and `$`. These are easier to
-///    support than word boundaries, but are still tricky and usually aren't
-///    as useful when searching dictionaries.
+/// 3. Zero width assertions other than the text anchors `^` and `$`, such
+///    as the multi-line anchors `(?m)^` and `(?m)$`.
 ///
 /// Otherwise, the [full syntax of the `regex`
 /// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
@@ -58,12 +57,63 @@ pub struct Regex {
     dfa: dfa::Dfa,
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn run_regex(re: &str, input: &str) -> bool {
+        let regex = Regex::new(re).unwrap();
+        let mut state = regex.start();
+        for b in input.as_bytes() {
+            state = regex.accept(&state, *b);
+            if !regex.can_match(&state) {
+                return false;
+            }
+        }
+        regex.is_match(&state)
+    }
+
+    #[test]
+    fn test_start_text() {
+        assert!(run_regex(r"^abc", "abc"));
+        assert!(run_regex(r"^abc.*", "abcdef"));
+        assert!(!run_regex(r"^abc", "defabc"));
+    }
+
+    #[test]
+    fn test_end_text() {
+        assert!(run_regex(r"abc$", "abc"));
+        assert!(run_regex(r".*abc$", "defabc"));
+        assert!(!run_regex(r"abc$", "abcdef"));
+    }
+
+    #[test]
+    fn test_start_and_end_text() {
+        assert!(run_regex(r"^abc$", "abc"));
+        assert!(!run_regex(r"^abc$", "defabc"));
+        assert!(!run_regex(r"^abc$", "abcdef"));
+    }
+
+    #[test]
+    fn test_empty_string() {
+        assert!(run_regex(r"^$", ""));
+        assert!(!run_regex(r"^$", "a"));
+    }
+
+    #[test]
+    fn test_multi_line_anchor_is_rejected() {
+        assert!(matches!(Regex::new(r"(?m)^abc"), Err(Error::NoEmpty)));
+    }
+}
+
 #[derive(Eq, PartialEq)]
 pub enum Inst {
     Match,
     Jump(usize),
     Split(usize, usize),
     Range(u8, u8),
+    StartText,
+    EndText,
 }
 
 impl Regex {
@@ -131,6 +181,8 @@ impl fmt::Debug for Inst {
             Jump(ip) => write!(f, "JUMP {}", ip),
             Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
             Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
+            StartText => write!(f, "START"),
+            EndText => write!(f, "END"),
         }
     }
 }