Skip to content

Commit 3fb34e2

Browse files
committed
Fix #186.
The problem here was that match instructions from each regex in a set were being carried through all of the DFA states. This was intentional so that when the DFA was done executing, we could look at the last state the machine was in and figure out which match instructions were reached, and therefore determine which expressions in the set matched. This doesn't work when every regex in the set is anchored, because it causes the DFA to continue creating non-dead states, even when no other regex could possibly match. To fix this, we are more selective about adding match instructions to states. This will cause the match instructions to eventually disappear, so we build up the matches in the set as they occur when producing cached states. This is a little hokey, but since we're only interested in the existence of a match, it works.
1 parent b4360ec commit 3fb34e2

File tree

3 files changed

+14
-15
lines changed

3 files changed

+14
-15
lines changed

src/compile.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ impl ByteClassSet {
998998
// N.B. If you're debugging the DFA, it's useful to simply return
999999
// `(0..256).collect()`, which effectively removes the byte classes
10001000
// and makes the transitions easier to read.
1001-
// return (0..256).collect();
1001+
// return (0..256).map(|b| b as u8).collect();
10021002
let mut byte_classes = vec![0; 256];
10031003
let mut class = 0u8;
10041004
for i in 0..256 {

src/dfa.rs

+9-14
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use std::fmt;
4646
use std::mem;
4747

4848
use exec::Search;
49-
use prog::{Inst, Program};
49+
use prog::Program;
5050
use sparse::SparseSet;
5151

5252
/// The cache limit specifies approximately how much space we're willing to
@@ -465,6 +465,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> {
465465
}
466466
self.at += 1;
467467
}
468+
// println!("exiting loop at input {:?}", self.at);
468469
// Run the DFA once more on the special EOF sentinel value.
469470
si = match self.next_state(qcur, qnext, si, Byte::eof()) {
470471
None => return DfaResult::Quit,
@@ -481,13 +482,6 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> {
481482
result = DfaResult::Match;
482483
self.search.set_end(Some(text.len()));
483484
}
484-
if result.is_match() && !self.search.find_one_match() {
485-
for &ip in &self.states[si as usize].insts {
486-
if let Inst::Match(slot) = self.prog[ip as usize] {
487-
self.search.set_match(slot);
488-
}
489-
}
490-
}
491485
result
492486
}
493487

@@ -628,16 +622,16 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> {
628622
Char(_) | Ranges(_) => unreachable!(),
629623
// These states are handled when following epsilon transitions.
630624
Save(_) | Split(_) | EmptyLook(_) => {}
631-
Match(_) => {
625+
Match(slot) => {
632626
is_match = true;
633627
if !self.continue_past_first_match() {
634628
break;
635-
} else if !self.search.find_one_match() {
629+
} else if !self.search.find_one_match()
630+
&& !self.search.has_match(slot)
631+
&& !qnext.contains_ip(ip as usize) {
636632
// If we are continuing on to find other matches,
637633
// then keep a record of the match states we've seen.
638-
if !qnext.contains_ip(ip as usize) {
639-
qnext.add(ip);
640-
}
634+
qnext.add(ip);
641635
}
642636
}
643637
Bytes(ref inst) => {
@@ -876,8 +870,9 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> {
876870
| NotWordBoundaryAscii => unreachable!(),
877871
}
878872
}
879-
Match(_) => {
873+
Match(slot) => {
880874
insts.push(ip);
875+
self.search.set_match(slot);
881876
if !self.continue_past_first_match() {
882877
break;
883878
}

src/exec.rs

+4
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ impl<'caps, 'matches> Search<'caps, 'matches> {
7676
self.matches.iter().all(|m| *m)
7777
}
7878

79+
pub fn has_match(&mut self, match_slot: usize) -> bool {
80+
self.matches.get(match_slot).map_or(false, |&b| b)
81+
}
82+
7983
pub fn set_match(&mut self, match_slot: usize) {
8084
self.matched_any = true;
8185
if let Some(old) = self.matches.get_mut(match_slot) {

0 commit comments

Comments
 (0)