From 4c59b7079424ef7423a4c9dfdfcaac4e8cf54928 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 25 Jun 2018 17:37:58 -0400 Subject: [PATCH] regex: expose lower level search APIs This commit exposes two new areas of API surface: 1. A new `captures_read` method which provides a way to access the offsets of submatches while amortizing the allocation of the space required to store those offsets. Callers should still of course prefer to use the higher level `captures` method, but if performance dictates, this lower level API may be useful. 2. New "at" variants of shortest_match/is_match/find/captures/captures_read that permit controlling where the start of a search begins within a slice. This is typically useful for controlling the match semantics of look-around operators such as `^` and `$`, and is necessary for implementing non-overlapping iterators. Fixes #219 --- ci/script.sh | 4 +- ci/test-regex-capi | 7 +++ src/exec.rs | 10 ++-- src/lib.rs | 3 +- src/re_bytes.rs | 128 +++++++++++++++++++++++++++++++++++++++------ src/re_trait.rs | 14 ++--- src/re_unicode.rs | 126 ++++++++++++++++++++++++++++++++++++++------ 7 files changed, 243 insertions(+), 49 deletions(-) create mode 100755 ci/test-regex-capi diff --git a/ci/script.sh b/ci/script.sh index b2b9d88661..9fa43079bf 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -30,9 +30,7 @@ cargo test --verbose --manifest-path regex-syntax/Cargo.toml cargo doc --verbose --manifest-path regex-syntax/Cargo.toml # Run tests on regex-capi crate. -cargo build --verbose --manifest-path regex-capi/Cargo.toml -(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test) -(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter) +ci/test-regex-capi # Make sure benchmarks compile. Don't run them though because they take a # very long time. Also, check that we can build the regex-debug tool. 
diff --git a/ci/test-regex-capi b/ci/test-regex-capi new file mode 100755 index 0000000000..f643aad0dd --- /dev/null +++ b/ci/test-regex-capi @@ -0,0 +1,7 @@ +#!/bin/sh + +set -e + +cargo build --verbose --manifest-path regex-capi/Cargo.toml +(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test) +(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter) diff --git a/src/exec.rs b/src/exec.rs index e6079e6c7a..578289aa5c 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -29,7 +29,7 @@ use prog::Program; use re_builder::RegexOptions; use re_bytes; use re_set; -use re_trait::{RegularExpression, Slot, Locations, as_slots}; +use re_trait::{RegularExpression, Slot, Locations}; use re_unicode; use utf8::next_utf8; @@ -359,13 +359,13 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> { } #[inline(always)] // reduces constant overhead - fn read_captures_at( + fn captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)> { - self.0.read_captures_at(locs, text.as_bytes(), start) + self.0.captures_read_at(locs, text.as_bytes(), start) } } @@ -528,13 +528,13 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// /// Note that the first two slots always correspond to the start and end /// locations of the overall match. 
- fn read_captures_at( + fn captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)> { - let slots = as_slots(locs); + let slots = locs.as_slots(); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/lib.rs b/src/lib.rs index 0fb0c7150f..240daac1ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -542,11 +542,11 @@ pub use re_builder::set_unicode::*; #[cfg(feature = "use_std")] pub use re_set::unicode::*; #[cfg(feature = "use_std")] -pub use re_trait::Locations; #[cfg(feature = "use_std")] pub use re_unicode::{ Regex, Match, Captures, CaptureNames, Matches, CaptureMatches, SubCaptureMatches, + CaptureLocations, Locations, Replacer, ReplacerRef, NoExpand, Split, SplitN, escape, }; @@ -644,7 +644,6 @@ pub mod bytes { pub use re_builder::set_bytes::*; pub use re_bytes::*; pub use re_set::bytes::*; - pub use re_trait::Locations; } mod backtrack; diff --git a/src/re_bytes.rs b/src/re_bytes.rs index d577d6b0c3..cbe5febe28 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -21,7 +21,7 @@ use exec::{Exec, ExecNoSync}; use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; +use re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Match represents a single match of a regex in a haystack. /// @@ -252,10 +252,10 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. 
pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { - let mut locs = self.locations(); - self.read_captures_at(&mut locs, text, 0).map(|_| Captures { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { text: text, - locs: locs, + locs: locs.0, named_groups: self.0.capture_name_idx().clone(), }) } @@ -568,7 +568,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] pub fn shortest_match_at( &self, text: &[u8], @@ -583,7 +582,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { self.shortest_match_at(text, start).is_some() } @@ -594,7 +592,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] pub fn find_at<'t>( &self, text: &'t [u8], @@ -604,21 +601,55 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } - /// Returns the same as captures, but starts the search at the given + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalent to the `0`th capture group. 
+ pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + ) -> Option> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as `captures_read`, but starts the search at the given /// offset and populates the capture locations given. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option> { + self.0 + .searcher() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. #[doc(hidden)] pub fn read_captures_at<'t>( &self, - locs: &mut Locations, + locs: &mut CaptureLocations, text: &'t [u8], start: usize, ) -> Option> { - self.0.searcher().read_captures_at(locs, text, start) - .map(|(s, e)| Match::new(text, s, e)) + self.captures_read_at(locs, text, start) } } @@ -639,11 +670,19 @@ impl Regex { self.0.capture_names().len() } - /// Returns an empty set of locations that can be reused in multiple calls - /// to `read_captures`. + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. 
#[doc(hidden)] - pub fn locations(&self) -> Locations { - self.0.searcher().locations() + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) } } @@ -769,6 +808,63 @@ impl<'r> Iterator for CaptureNames<'r> { } } +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `captures_read`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capturing groups. 
+ /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + /// Captures represents a group of captured byte strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent @@ -782,7 +878,7 @@ impl<'r> Iterator for CaptureNames<'r> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t [u8], - locs: Locations, + locs: re_trait::Locations, named_groups: Arc>, } diff --git a/src/re_trait.rs b/src/re_trait.rs index de674bd528..81a64bb252 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -18,6 +18,7 @@ pub type Slot = Option; /// /// Unlike `Captures`, a `Locations` value only stores offsets. #[doc(hidden)] +#[derive(Clone, Debug)] pub struct Locations(Vec); impl Locations { @@ -47,12 +48,11 @@ impl Locations { pub fn len(&self) -> usize { self.0.len() / 2 } -} -/// This is a hack to make Locations -> &mut [Slot] be available internally -/// without exposing it in the public API. -pub fn as_slots(locs: &mut Locations) -> &mut [Slot] { - &mut locs.0 + /// Return the individual slots as a slice. + pub(crate) fn as_slots(&mut self) -> &mut [Slot] { + &mut self.0 + } } /// An iterator over capture group positions for a particular match of a @@ -139,7 +139,7 @@ pub trait RegularExpression: Sized { /// Returns the leftmost-first match location if one exists, and also /// fills in any matching capture slot locations. 
- fn read_captures_at( + fn captures_read_at( &self, locs: &mut Locations, text: &Self::Text, @@ -246,7 +246,7 @@ impl<'t, R> Iterator for CaptureMatches<'t, R> return None } let mut locs = self.0.re.locations(); - let (s, e) = match self.0.re.read_captures_at( + let (s, e) = match self.0.re.captures_read_at( &mut locs, self.0.text, self.0.last_end, diff --git a/src/re_unicode.rs b/src/re_unicode.rs index fb78e56e4c..0bc5b185d6 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -22,7 +22,7 @@ use error::Error; use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; -use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; +use re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Escapes all regular expression meta characters in `text`. /// @@ -309,10 +309,10 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut locs = self.locations(); - self.read_captures_at(&mut locs, text, 0).map(|_| Captures { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { text: text, - locs: locs, + locs: locs.0, named_groups: self.0.capture_name_idx().clone(), }) } @@ -624,7 +624,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] pub fn shortest_match_at( &self, text: &str, @@ -639,7 +638,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
- #[doc(hidden)] pub fn is_match_at(&self, text: &str, start: usize) -> bool { self.shortest_match_at(text, start).is_some() } @@ -650,7 +648,6 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] pub fn find_at<'t>( &self, text: &'t str, @@ -661,24 +658,56 @@ impl Regex { }) } + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalent to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + ) -> Option> { + self.captures_read_at(locs, text, 0) + } + /// Returns the same as captures, but starts the search at the given /// offset and populates the capture locations given. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at<'t>( + pub fn captures_read_at<'t>( &self, - locs: &mut Locations, + locs: &mut CaptureLocations, text: &'t str, start: usize, ) -> Option> { self.0 .searcher_str() - .read_captures_at(locs, text, start) + .captures_read_at(&mut locs.0, text, start) .map(|(s, e)| Match::new(text, s, e)) } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. 
+ #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option> { + self.captures_read_at(locs, text, start) + } } /// Auxiliary methods. @@ -698,11 +727,19 @@ impl Regex { self.0.capture_names().len() } - /// Returns an empty set of locations that can be reused in multiple calls - /// to `read_captures`. + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. #[doc(hidden)] - pub fn locations(&self) -> Locations { - self.0.searcher_str().locations() + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) } } @@ -790,6 +827,63 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { } } +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `captures_read`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. 
+#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + /// Captures represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent @@ -803,7 +897,7 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - locs: Locations, + locs: re_trait::Locations, named_groups: Arc>, }