Skip to content

Commit 53395bf

Browse files
committed
literal: upgrade to aho-corasick 0.7
This is a "dumb" update in that we retain exactly the same functionality as before.
1 parent 1dfac42 commit 53395bf

File tree

4 files changed

+36
-22
lines changed

4 files changed

+36
-22
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ members = [
2626

2727
[dependencies]
2828
# For very fast prefix literal matching.
29-
aho-corasick = "0.6.7"
29+
aho-corasick = "0.7.1"
3030
# For skipping along search text quickly when a leading byte is known.
3131
memchr = "2.0.2"
3232
# For managing regex caches quickly across multiple threads.

src/literal/mod.rs

+13-7
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
use std::cmp;
1212
use std::mem;
1313

14-
use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
14+
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1515
use memchr::{memchr, memchr2, memchr3};
1616
use syntax::hir::literal::{Literal, Literals};
1717

@@ -46,7 +46,7 @@ enum Matcher {
4646
/// A single substring, found using Boyer-Moore.
4747
BoyerMoore(BoyerMooreSearch),
4848
/// An Aho-Corasick automaton.
49-
AC(FullAcAutomaton<Literal>),
49+
AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
5050
/// A SIMD-accelerated multiple string matcher. Used only for a small
5151
/// number of small literals.
5252
TeddySSSE3(TeddySSSE3),
@@ -102,7 +102,9 @@ impl LiteralSearcher {
102102
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
103103
FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
104104
BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
105-
AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
105+
AC { ref ac, .. } => {
106+
ac.find(haystack).map(|m| (m.start(), m.end()))
107+
}
106108
TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
107109
TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
108110
}
@@ -141,7 +143,7 @@ impl LiteralSearcher {
141143
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
142144
Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
143145
Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
144-
Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
146+
Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
145147
Matcher::TeddySSSE3(ref ted) => {
146148
LiteralIter::TeddySSSE3(ted.patterns())
147149
}
@@ -174,7 +176,7 @@ impl LiteralSearcher {
174176
Bytes(ref sset) => sset.dense.len(),
175177
FreqyPacked(_) => 1,
176178
BoyerMoore(_) => 1,
177-
AC(ref aut) => aut.len(),
179+
AC { ref ac, .. } => ac.pattern_count(),
178180
TeddySSSE3(ref ted) => ted.len(),
179181
TeddyAVX2(ref ted) => ted.len(),
180182
}
@@ -188,7 +190,7 @@ impl LiteralSearcher {
188190
Bytes(ref sset) => sset.approximate_size(),
189191
FreqyPacked(ref single) => single.approximate_size(),
190192
BoyerMoore(ref single) => single.approximate_size(),
191-
AC(ref aut) => aut.heap_bytes(),
193+
AC { ref ac, .. } => ac.heap_bytes(),
192194
TeddySSSE3(ref ted) => ted.approximate_size(),
193195
TeddyAVX2(ref ted) => ted.approximate_size(),
194196
}
@@ -258,7 +260,11 @@ impl Matcher {
258260
// Fallthrough to ol' reliable Aho-Corasick...
259261
}
260262
let pats = lits.literals().to_owned();
261-
Matcher::AC(AcAutomaton::new(pats).into_full())
263+
let ac = AhoCorasickBuilder::new()
264+
.dfa(true)
265+
.build_with_size::<u32, _, _>(&pats)
266+
.unwrap();
267+
Matcher::AC { ac, lits: pats }
262268
}
263269
}
264270

src/literal/teddy_avx2/imp.rs

+11-7
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ basically the same as the SSSE3 version, but using 256-bit vectors instead of
99

1010
use std::cmp;
1111

12-
use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
12+
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1313
use syntax::hir::literal::Literals;
1414

1515
use vector::avx2::{AVX2VectorBuilder, u8x32};
@@ -38,7 +38,7 @@ pub struct Teddy {
3838
pats: Vec<Vec<u8>>,
3939
/// An Aho-Corasick automaton of the patterns. We use this when we need to
4040
/// search pieces smaller than the Teddy block size.
41-
ac: FullAcAutomaton<Vec<u8>>,
41+
ac: AhoCorasick,
4242
/// A set of 8 buckets. Each bucket corresponds to a single member of a
4343
/// bitset. A bucket contains zero or more substrings. This is useful
4444
/// when the number of substrings exceeds 8, since our bitsets cannot have
@@ -88,10 +88,14 @@ impl Teddy {
8888
buckets[bucket].push(pati);
8989
masks.add(bucket as u8, pat);
9090
}
91+
let ac = AhoCorasickBuilder::new()
92+
.dfa(true)
93+
.prefilter(false)
94+
.build(&pats);
9195
Some(Teddy {
9296
vb: vb,
9397
pats: pats.to_vec(),
94-
ac: AcAutomaton::new(pats.to_vec()).into_full(),
98+
ac: ac,
9599
buckets: buckets,
96100
masks: masks,
97101
})
@@ -341,11 +345,11 @@ impl Teddy {
341345
/// block based approach.
342346
#[inline(never)]
343347
fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
344-
self.ac.find(&haystack[pos..]).next().map(|m| {
348+
self.ac.find(&haystack[pos..]).map(|m| {
345349
Match {
346-
pat: m.pati,
347-
start: pos + m.start,
348-
end: pos + m.end,
350+
pat: m.pattern(),
351+
start: pos + m.start(),
352+
end: pos + m.end(),
349353
}
350354
})
351355
}

src/literal/teddy_ssse3/imp.rs

+11-7
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ References
320320

321321
use std::cmp;
322322

323-
use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
323+
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
324324
use syntax::hir::literal::Literals;
325325

326326
use vector::ssse3::{SSSE3VectorBuilder, u8x16};
@@ -349,7 +349,7 @@ pub struct Teddy {
349349
pats: Vec<Vec<u8>>,
350350
/// An Aho-Corasick automaton of the patterns. We use this when we need to
351351
/// search pieces smaller than the Teddy block size.
352-
ac: FullAcAutomaton<Vec<u8>>,
352+
ac: AhoCorasick,
353353
/// A set of 8 buckets. Each bucket corresponds to a single member of a
354354
/// bitset. A bucket contains zero or more substrings. This is useful
355355
/// when the number of substrings exceeds 8, since our bitsets cannot have
@@ -399,10 +399,14 @@ impl Teddy {
399399
buckets[bucket].push(pati);
400400
masks.add(bucket as u8, pat);
401401
}
402+
let ac = AhoCorasickBuilder::new()
403+
.dfa(true)
404+
.prefilter(false)
405+
.build(&pats);
402406
Some(Teddy {
403407
vb: vb,
404408
pats: pats.to_vec(),
405-
ac: AcAutomaton::new(pats.to_vec()).into_full(),
409+
ac: ac,
406410
buckets: buckets,
407411
masks: masks,
408412
})
@@ -651,11 +655,11 @@ impl Teddy {
651655
/// block based approach.
652656
#[inline(never)]
653657
fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
654-
self.ac.find(&haystack[pos..]).next().map(|m| {
658+
self.ac.find(&haystack[pos..]).map(|m| {
655659
Match {
656-
pat: m.pati,
657-
start: pos + m.start,
658-
end: pos + m.end,
660+
pat: m.pattern(),
661+
start: pos + m.start(),
662+
end: pos + m.end(),
659663
}
660664
})
661665
}

0 commit comments

Comments
 (0)