Skip to content

Commit 7084f14

Browse files
committed
Add known upper limit to capture search.
The DFA will report the end location of a match, so we should pass that along to capture detection. In theory, the DFA and the NFA report the same match locations, so this upper bound shouldn't be necessary---the NFA should quit once it finds the right match. It turns out, though, that bounding the text has two important ramifications: 1. It will enable the backtracking engine to be used more often. In particular, the backtracking engine can only be used on small inputs, and this change decreases the size of the input by only considering the match. 2. The backtracking engine must start every search by zeroing memory that is proportional to the size of the input. If the input is smaller, then this runs more quickly. We are also careful to bound the match to one additional "character" past the end of the match, so that lookahead operators work correctly.
1 parent 4471212 commit 7084f14

File tree

2 files changed

+31
-13
lines changed

2 files changed

+31
-13
lines changed

src/exec.rs

+12-13
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
use std::cell::RefCell;
1212
use std::collections::HashMap;
13+
use std::cmp;
1314
use std::sync::Arc;
1415

1516
use thread_local::CachedThreadLocal;
@@ -27,6 +28,7 @@ use re_bytes;
2728
use re_trait::{RegularExpression, Slot};
2829
use re_unicode;
2930
use set;
31+
use utf8::next_utf8;
3032

3133
/// Exec manages the execution of a regular expression.
3234
///
@@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
253255
fn slots_len(&self) -> usize { self.0.slots_len() }
254256

255257
fn next_after_empty(&self, text: &str, i: usize) -> usize {
256-
let b = text.as_bytes()[i];
257-
let inc = if b <= 0x7F {
258-
1
259-
} else if b <= 0b110_11111 {
260-
2
261-
} else if b <= 0b1110_1111 {
262-
3
263-
} else {
264-
4
265-
};
266-
i + inc
258+
next_utf8(text.as_bytes(), i)
267259
}
268260

269261
#[inline(always)] // reduces constant overhead
@@ -439,9 +431,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
439431
}
440432
MatchType::Dfa => {
441433
match self.find_dfa_forward(text, start) {
442-
dfa::Result::Match((s, _)) => {
434+
dfa::Result::Match((s, e)) => {
435+
// We need the +1 here to account for lookahead
436+
// operators.
437+
let e = if self.ro.nfa.uses_bytes() {
438+
cmp::min(e + 1, text.len())
439+
} else {
440+
cmp::min(next_utf8(text, e), text.len())
441+
};
443442
self.captures_nfa(
444-
MatchNfaType::Auto, slots, text, s)
443+
MatchNfaType::Auto, slots, &text[..e], s)
445444
}
446445
dfa::Result::NoMatch => None,
447446
dfa::Result::Quit => {

src/utf8.rs

+19
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
1919
const TAG_THREE: u8 = 0b1110_0000;
2020
const TAG_FOUR: u8 = 0b1111_0000;
2121

22+
/// Returns the smallest possible index of the next valid UTF-8 sequence
23+
/// starting after `i`.
///
/// The step size is chosen from the byte at `text[i]` alone: values up to
/// `0x7F` advance by 1, values up to `0xDF` by 2, values up to `0xEF` by 3,
/// and anything larger by 4. Bytes in `0x80..=0xBF` therefore fall into the
/// 2-byte case.
///
/// If `i` is out of bounds for `text`, `i + 1` is returned.
///
/// The bytes following `text[i]` are never inspected, so the returned index
/// can be greater than `text.len()`; callers that need an in-bounds index
/// must clamp the result themselves.
24+
pub fn next_utf8(text: &[u8], i: usize) -> usize {
25+
let b = match text.get(i) {
26+
// Nothing to decode at `i`; step a single position forward.
None => return i + 1,
27+
Some(&b) => b,
28+
};
29+
// Classify `b` by comparing it against the upper bound of each range.
let inc = if b <= 0x7F {
30+
1
31+
} else if b <= 0b110_11111 {
32+
2
33+
} else if b <= 0b1110_1111 {
34+
3
35+
} else {
36+
4
37+
};
38+
i + inc
39+
}
40+
2241
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
2342
///
2443
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number

0 commit comments

Comments
 (0)