From 71f707d762d3a72cf2199d9fca6f2aef830144e6 Mon Sep 17 00:00:00 2001
From: Neil Hansen <neil.hansen.31@gmail.com>
Date: Fri, 15 Nov 2024 15:56:54 -0800
Subject: [PATCH 1/2] regex: add StartText/EndText for ^ and $ (tests pass)

---
 src/regex/compile.rs | 12 ++++++-
 src/regex/dfa.rs     | 29 +++++++++++++++--
 src/regex/error.rs   | 12 +------
 src/regex/mod.rs     | 74 +++++++++++++++++++++++++++++++++++++-------
 4 files changed, 100 insertions(+), 27 deletions(-)
diff --git a/src/regex/compile.rs b/src/regex/compile.rs
index 7267a7d4..fcfaa3cf 100644
--- a/src/regex/compile.rs
+++ b/src/regex/compile.rs
@@ -96,7 +96,17 @@ impl Compiler {
                     self.set_split(split, j2, j3);
                 }
             }
-            HirKind::Look(_) => return Err(Error::NoEmpty),
+            HirKind::Look(look) => {
+                match look {
+                    regex_syntax::hir::Look::Start => {
+                        self.push(Inst::StartText);
+                    }
+                    regex_syntax::hir::Look::End => {
+                        self.push(Inst::EndText);
+                    }
+                    _ => return Err(Error::NoWordBoundary),
+                }
+            }
         }
         self.check_size()
     }
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
index 3af59edf..6d52c119 100644
--- a/src/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -21,6 +21,8 @@ struct State {
     insts: Vec<usize>,
     next: [Option<usize>; 256],
     is_match: bool,
+    at_start: bool,
+    at_end: bool,
 }
 
 impl DfaBuilder {
@@ -69,7 +71,7 @@ impl DfaBuilder {
         for &ip in &self.dfa.states[state].insts {
             cur.add(ip);
         }
-        self.dfa.run(cur, next, byte);
+        self.dfa.run(cur, next, Some(byte), false, false);
         let next_state = self.cached_state(next);
         self.dfa.states[state].next[byte as usize] = next_state;
         next_state
@@ -92,6 +94,7 @@ impl DfaBuilder {
                     is_match = true;
                     insts.push(ip);
                 }
+                StartText | EndText => insts.push(ip),
             }
         }
         if insts.is_empty() {
@@ -104,6 +107,8 @@ impl DfaBuilder {
                     insts,
                     next: [None; 256],
                     is_match,
+                    at_start: false,
+                    at_end: false,
                 });
                 *v.insert(self.dfa.states.len() - 1)
             }
@@ -134,10 +139,16 @@ impl Dfa {
                 self.add(set, ip1);
                 self.add(set, ip2);
             }
+            StartText => {
+                self.add(set, ip + 1);
+            }
+            EndText => {
+                self.add(set, ip + 1);
+            }
         }
     }
 
-    fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool {
+    fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: Option<u8>, at_start: bool, at_end: bool) -> bool {
         use super::Inst::*;
         to.clear();
         let mut is_match = false;
@@ -147,7 +158,19 @@ impl Dfa {
                 Jump(_) | Split(_, _) => {}
                 Match => is_match = true,
                 Range(s, e) => {
-                    if s <= byte && byte <= e {
+                    if let Some(b) = byte {
+                        if s <= b && b <= e {
+                            self.add(to, ip + 1);
+                        }
+                    }
+                }
+                StartText => {
+                    if at_start {
+                        self.add(to, ip + 1);
+                    }
+                }
+                EndText => {
+                    if at_end {
                         self.add(to, ip + 1);
                     }
                 }
diff --git a/src/regex/error.rs b/src/regex/error.rs
index 561c554c..cf1826f3 100644
--- a/src/regex/error.rs
+++ b/src/regex/error.rs
@@ -28,11 +28,6 @@ pub enum Error {
     ///
     /// This restriction may be lifted in the future.
     NoWordBoundary,
-    /// Empty or "zero width assertions" such as `^` or `$` are currently
-    /// not allowed.
-    ///
-    /// This restriction may be lifted in the future.
-    NoEmpty,
     /// Byte literals such as `(?-u:\xff)` are not allowed.
     ///
     /// This restriction may be lifted in the future.
@@ -49,7 +44,7 @@ impl From<regex_syntax::Error> for Error {
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         use self::Error::*;
-        match *self {
+        match self {
             Syntax(ref err) => err.fmt(f),
             CompiledTooBig(size_limit) => write!(
                 f,
@@ -71,11 +66,6 @@ impl fmt::Display for Error {
                 "Word boundary operators are not \
                  allowed."
             ),
-            NoEmpty => write!(
-                f,
-                "Empty match operators are not allowed \
-                 (hopefully temporary)."
-            ),
             NoBytes => write!(f, "Byte literals are not allowed."),
         }
     }
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
index 50f25185..84ec5f10 100644
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -29,9 +29,6 @@ pub use self::error::Error;
 /// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in
 ///    a deterministic finite automaton, but not impossible. As such, these
 ///    may be allowed some day.
-/// 3. Other zero width assertions like `^` and `$`. These are easier to
-///    support than word boundaries, but are still tricky and usually aren't
-///    as useful when searching dictionaries.
 ///
 /// Otherwise, the [full syntax of the `regex`
 /// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
@@ -58,12 +55,58 @@ pub struct Regex {
     dfa: dfa::Dfa,
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn run_regex(re: &str, input: &str) -> bool {
+        let regex = Regex::new(re).unwrap();
+        let mut state = regex.start();
+        for b in input.as_bytes() {
+            state = regex.accept(&state, *b);
+            if !regex.can_match(&state) {
+                return false;
+            }
+        }
+        regex.is_match(&state)
+    }
+
+    #[test]
+    fn test_start_text() {
+        assert!(run_regex(r"^abc", "abc"));
+        assert!(run_regex(r"^abc.*", "abcdef"));
+        assert!(!run_regex(r"^abc", "defabc"));
+    }
+
+    #[test]
+    fn test_end_text() {
+        assert!(run_regex(r"abc$", "abc"));
+        assert!(run_regex(r".*abc$", "defabc"));
+        assert!(!run_regex(r"abc$", "abcdef"));
+    }
+
+    #[test]
+    fn test_start_and_end_text() {
+        assert!(run_regex(r"^abc$", "abc"));
+        assert!(!run_regex(r"^abc$", "defabc"));
+        assert!(!run_regex(r"^abc$", "abcdef"));
+    }
+
+    #[test]
+    fn test_empty_string() {
+        assert!(run_regex(r"^$", ""));
+        assert!(!run_regex(r"^$", "a"));
+    }
+}
+
 #[derive(Eq, PartialEq)]
 pub enum Inst {
     Match,
     Jump(usize),
     Split(usize, usize),
     Range(u8, u8),
+    StartText,
+    EndText,
 }
 
 impl Regex {
@@ -93,26 +136,31 @@ impl Regex {
 }
 
 impl Automaton for Regex {
-    type State = Option<usize>;
+    type State = (Option<usize>, usize); // (state index, position)
 
     #[inline]
-    fn start(&self) -> Option<usize> {
-        Some(0)
+    fn start(&self) -> Self::State {
+        (Some(0), 0)
     }
 
     #[inline]
-    fn is_match(&self, state: &Option<usize>) -> bool {
-        state.map(|state| self.dfa.is_match(state)).unwrap_or(false)
+    fn is_match(&self, state: &Self::State) -> bool {
+        state
+            .0
+            .map(|state| self.dfa.is_match(state))
+            .unwrap_or(false)
     }
 
     #[inline]
-    fn can_match(&self, state: &Option<usize>) -> bool {
-        state.is_some()
+    fn can_match(&self, state: &Self::State) -> bool {
+        state.0.is_some()
     }
 
     #[inline]
-    fn accept(&self, state: &Option<usize>, byte: u8) -> Option<usize> {
-        state.and_then(|state| self.dfa.accept(state, byte))
+    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
+        let (si, pos) = *state;
+        let si = si.and_then(|si| self.dfa.accept(si, byte));
+        (si, pos + 1)
     }
 }
 
@@ -131,6 +179,8 @@ impl fmt::Debug for Inst {
             Jump(ip) => write!(f, "JUMP {}", ip),
             Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
             Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
+            StartText => write!(f, "START"),
+            EndText => write!(f, "END"),
         }
     }
 }

From eb590eb5c128b3865203e791dce3b2aa411808cb Mon Sep 17 00:00:00 2001
From: Neil Hansen <neil.hansen.31@gmail.com>
Date: Tue, 19 Nov 2024 15:24:22 -0800
Subject: [PATCH 2/2] regex/dfa: remove extra at_start/at_end fields from State

---
 src/regex/dfa.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
index 6d52c119..89592140 100644
--- a/src/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -21,8 +21,6 @@ struct State {
     insts: Vec<usize>,
     next: [Option<usize>; 256],
     is_match: bool,
-    at_start: bool,
-    at_end: bool,
 }
 
 impl DfaBuilder {
@@ -107,8 +105,6 @@ impl DfaBuilder {
                     insts,
                     next: [None; 256],
                     is_match,
-                    at_start: false,
-                    at_end: false,
                 });
                 *v.insert(self.dfa.states.len() - 1)
             }
@@ -148,7 +144,14 @@ impl Dfa {
         }
     }
 
-    fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: Option<u8>, at_start: bool, at_end: bool) -> bool {
+    fn run(
+        &self,
+        from: &SparseSet,
+        to: &mut SparseSet,
+        byte: Option<u8>,
+        at_start: bool,
+        at_end: bool,
+    ) -> bool {
         use super::Inst::*;
         to.clear();
         let mut is_match = false;