Add known upper limit to capture search.

BurntSushi · BurntSushi · commit 7084f14571b6 · 2016-04-27T08:48:47.000-04:00
The DFA will report the end location of a match, so we should pass that
along to capture detection. In theory, the DFA and the NFA report the
same match locations, so this upper bound shouldn't be necessary---the
NFA should quit once it finds the right match. It turns out, though, that
bounding the text has two important ramifications:

1. It will enable the backtracking engine to be used more often. In
particular, the backtracking engine can only be used on small inputs and
this change decreases the size of the input by only considering the
match.
2. The backtracking engine must start every search by zeroing memory
that is proportional to the size of the input. If the input is smaller,
then this runs more quickly.

We are also careful to extend the bound one additional "character" past
the end of the match, so that lookahead operators work correctly.
diff --git a/src/exec.rs b/src/exec.rs
@@ -10,6 +10,7 @@
 
 use std::cell::RefCell;
 use std::collections::HashMap;
+use std::cmp;
 use std::sync::Arc;
 
 use thread_local::CachedThreadLocal;
@@ -27,6 +28,7 @@ use re_bytes;
 use re_trait::{RegularExpression, Slot};
 use re_unicode;
 use set;
+use utf8::next_utf8;
 
 /// Exec manages the execution of a regular expression.
 ///
@@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
     fn slots_len(&self) -> usize { self.0.slots_len() }
 
     fn next_after_empty(&self, text: &str, i: usize) -> usize {
-        let b = text.as_bytes()[i];
-        let inc = if b <= 0x7F {
-            1
-        } else if b <= 0b110_11111 {
-            2
-        } else if b <= 0b1110_1111 {
-            3
-        } else {
-            4
-        };
-        i + inc
+        next_utf8(text.as_bytes(), i)
     }
 
     #[inline(always)] // reduces constant overhead
@@ -439,9 +431,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
             }
             MatchType::Dfa => {
                 match self.find_dfa_forward(text, start) {
-                    dfa::Result::Match((s, _)) => {
+                    dfa::Result::Match((s, e)) => {
+                        // We need the +1 here to account for lookahead
+                        // operators.
+                        let e = if self.ro.nfa.uses_bytes() {
+                            cmp::min(e + 1, text.len())
+                        } else {
+                            cmp::min(next_utf8(text, e), text.len())
+                        };
                         self.captures_nfa(
-                            MatchNfaType::Auto, slots, text, s)
+                            MatchNfaType::Auto, slots, &text[..e], s)
                     }
                     dfa::Result::NoMatch => None,
                     dfa::Result::Quit => {
diff --git a/src/utf8.rs b/src/utf8.rs
@@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
 const TAG_THREE: u8 = 0b1110_0000;
 const TAG_FOUR: u8 = 0b1111_0000;
 
+/// Returns the index just past the UTF-8 sequence whose leading byte is at
+/// `i`, judged by that byte alone; if `i` is out of bounds, returns `i + 1`.
+pub fn next_utf8(text: &[u8], i: usize) -> usize {
+    let b = match text.get(i) {
+        None => return i + 1, // `i` is past the end; still advance by one
+        Some(&b) => b,
+    };
+    let inc = if b <= 0x7F {
+        1 // ASCII byte
+    } else if b <= 0b110_11111 {
+        2 // NOTE(review): continuation bytes (0x80-0xBF) also land here
+    } else if b <= 0b1110_1111 {
+        3
+    } else {
+        4
+    };
+    i + inc // may exceed `text.len()`; no clamping is done here
+}
+
 /// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
 ///
 /// If `dst` is not long enough, then `None` is returned. Otherwise, the number