Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for ^ and $ anchors in regular expressions #23

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/regex/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,17 @@ impl Compiler {
self.set_split(split, j2, j3);
}
}
HirKind::Look(_) => return Err(Error::NoEmpty),
HirKind::Look(look) => {
match look {
regex_syntax::hir::Look::Start => {
self.push(Inst::StartText);
}
regex_syntax::hir::Look::End => {
self.push(Inst::EndText);
}
_ => return Err(Error::NoWordBoundary),
}
}
}
self.check_size()
}
Expand Down
32 changes: 29 additions & 3 deletions src/regex/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl DfaBuilder {
for &ip in &self.dfa.states[state].insts {
cur.add(ip);
}
self.dfa.run(cur, next, byte);
self.dfa.run(cur, next, Some(byte), false, false);
let next_state = self.cached_state(next);
self.dfa.states[state].next[byte as usize] = next_state;
next_state
Expand All @@ -92,6 +92,7 @@ impl DfaBuilder {
is_match = true;
insts.push(ip);
}
StartText | EndText => insts.push(ip),
}
}
if insts.is_empty() {
Expand Down Expand Up @@ -134,10 +135,23 @@ impl Dfa {
self.add(set, ip1);
self.add(set, ip2);
}
StartText => {
self.add(set, ip + 1);
}
EndText => {
self.add(set, ip + 1);
}
}
}

fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool {
fn run(
&self,
from: &SparseSet,
to: &mut SparseSet,
byte: Option<u8>,
at_start: bool,
at_end: bool,
) -> bool {
use super::Inst::*;
to.clear();
let mut is_match = false;
Expand All @@ -147,7 +161,19 @@ impl Dfa {
Jump(_) | Split(_, _) => {}
Match => is_match = true,
Range(s, e) => {
if s <= byte && byte <= e {
if let Some(b) = byte {
if s <= b && b <= e {
self.add(to, ip + 1);
}
}
}
StartText => {
if at_start {
self.add(to, ip + 1);
}
}
EndText => {
if at_end {
self.add(to, ip + 1);
}
}
Expand Down
12 changes: 1 addition & 11 deletions src/regex/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,6 @@ pub enum Error {
///
/// This restriction may be lifted in the future.
NoWordBoundary,
/// Empty or "zero width assertions" such as `^` or `$` are currently
/// not allowed.
///
/// This restriction may be lifted in the future.
NoEmpty,
/// Byte literals such as `(?-u:\xff)` are not allowed.
///
/// This restriction may be lifted in the future.
Expand All @@ -49,7 +44,7 @@ impl From<regex_syntax::Error> for Error {
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match *self {
match self {
Syntax(ref err) => err.fmt(f),
CompiledTooBig(size_limit) => write!(
f,
Expand All @@ -71,11 +66,6 @@ impl fmt::Display for Error {
"Word boundary operators are not \
allowed."
),
NoEmpty => write!(
f,
"Empty match operators are not allowed \
(hopefully temporary)."
),
NoBytes => write!(f, "Byte literals are not allowed."),
}
}
Expand Down
74 changes: 62 additions & 12 deletions src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ pub use self::error::Error;
/// 2. Word boundaries (i.e., `\b`). These are hard, though not impossible,
/// to support in a deterministic finite automaton, so they may be allowed
/// some day.
/// 3. Other zero width assertions like `^` and `$`. These are easier to
/// support than word boundaries, but are still tricky and usually aren't
/// as useful when searching dictionaries.
///
/// Otherwise, the [full syntax of the `regex`
/// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
Expand All @@ -58,12 +55,58 @@ pub struct Regex {
dfa: dfa::Dfa,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `re`, drives the resulting automaton over every byte of
    /// `input`, and reports whether the state reached at the end is a match.
    ///
    /// Stops early and returns `false` as soon as the automaton reports that
    /// no match is possible any more. Panics if `re` fails to compile.
    fn run_regex(re: &str, input: &str) -> bool {
        let automaton = Regex::new(re).unwrap();
        // Fold the input through the automaton; `None` short-circuits the
        // fold the moment a dead state is reached.
        let survivor = input.bytes().try_fold(automaton.start(), |state, byte| {
            let next = automaton.accept(&state, byte);
            if automaton.can_match(&next) {
                Some(next)
            } else {
                None
            }
        });
        match survivor {
            Some(last) => automaton.is_match(&last),
            None => false,
        }
    }

    /// `^` at the front of a pattern.
    #[test]
    fn test_start_text() {
        assert!(run_regex(r"^abc", "abc"));
        assert!(run_regex(r"^abc.*", "abcdef"));
        assert!(!run_regex(r"^abc", "defabc"));
    }

    /// `$` at the end of a pattern.
    #[test]
    fn test_end_text() {
        assert!(run_regex(r"abc$", "abc"));
        assert!(run_regex(r".*abc$", "defabc"));
        assert!(!run_regex(r"abc$", "abcdef"));
    }

    /// Both anchors around a literal.
    #[test]
    fn test_start_and_end_text() {
        assert!(run_regex(r"^abc$", "abc"));
        assert!(!run_regex(r"^abc$", "defabc"));
        assert!(!run_regex(r"^abc$", "abcdef"));
    }

    /// Anchors with nothing between them only accept the empty input.
    #[test]
    fn test_empty_string() {
        assert!(run_regex(r"^$", ""));
        assert!(!run_regex(r"^$", "a"));
    }
}

/// A single instruction of the compiled regex program that the DFA is built
/// from.
#[derive(Eq, PartialEq)]
pub enum Inst {
/// Reaching this instruction marks the current state as a match.
Match,
/// Carries a single instruction index (printed as `JUMP ip`); presumably an
/// unconditional jump to that instruction.
Jump(usize),
/// Carries two instruction indices (printed as `SPLIT ip1, ip2`); both
/// targets are explored.
Split(usize, usize),
/// Accepts one input byte `b` with `start <= b && b <= end` (both bounds
/// inclusive) and then continues with the following instruction.
Range(u8, u8),
/// Start-of-text assertion, emitted by the compiler for `Look::Start`.
StartText,
/// End-of-text assertion, emitted by the compiler for `Look::End`.
EndText,
}

impl Regex {
Expand Down Expand Up @@ -93,26 +136,31 @@ impl Regex {
}

// Drives the compiled DFA one byte at a time through the `Automaton` trait.
//
// The state holds an optional DFA state index; `None` means the automaton is
// dead and can never match again. In the tuple form the second component
// counts the bytes fed so far.
// NOTE(review): none of the methods in this impl read the position component;
// `accept` only increments it. Confirm whether it is meant to feed the
// start/end-of-text checks.
impl Automaton for Regex {
type State = Option<usize>;
type State = (Option<usize>, usize); // (state index, position)

// Initial state: DFA state index 0 (and position 0 in the tuple form).
#[inline]
fn start(&self) -> Option<usize> {
Some(0)
fn start(&self) -> Self::State {
(Some(0), 0)
}

// A state matches only when it still holds a DFA state index and the DFA
// reports that index as a match; a dead (`None`) state never matches.
#[inline]
fn is_match(&self, state: &Option<usize>) -> bool {
state.map(|state| self.dfa.is_match(state)).unwrap_or(false)
fn is_match(&self, state: &Self::State) -> bool {
state
.0
.map(|state| self.dfa.is_match(state))
.unwrap_or(false)
}

// A match is still possible exactly while a DFA state index is present.
#[inline]
fn can_match(&self, state: &Option<usize>) -> bool {
state.is_some()
fn can_match(&self, state: &Self::State) -> bool {
state.0.is_some()
}

// Steps the DFA on `byte`. A dead state stays dead (`and_then` on `None`).
// In the tuple form the position is advanced by one on every call, even when
// the state index is already `None`.
#[inline]
fn accept(&self, state: &Option<usize>, byte: u8) -> Option<usize> {
state.and_then(|state| self.dfa.accept(state, byte))
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
let (si, pos) = *state;
let si = si.and_then(|si| self.dfa.accept(si, byte));
(si, pos + 1)
}
}

Expand All @@ -131,6 +179,8 @@ impl fmt::Debug for Inst {
Jump(ip) => write!(f, "JUMP {}", ip),
Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
StartText => write!(f, "START"),
EndText => write!(f, "END"),
}
}
}