regex_automata/meta/literal.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
use alloc::{vec, vec::Vec};
use regex_syntax::hir::Hir;
use crate::{meta::regex::RegexInfo, util::search::MatchKind};
/// Pull out an alternation of literals from the given sequence of HIR
/// expressions.
///
/// There are numerous ways for this to fail. Generally, this only applies
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
/// are "too few" alternates, in which case, the regex engine is likely faster.
///
/// And currently, this only returns something when 'hirs.len() == 1'.
pub(crate) fn alternation_literals(
info: &RegexInfo,
hirs: &[&Hir],
) -> Option<Vec<Vec<u8>>> {
use regex_syntax::hir::{HirKind, Literal};
// Might as well skip the work below if we know we can't build an
// Aho-Corasick searcher.
if !cfg!(feature = "perf-literal-multisubstring") {
return None;
}
// This is pretty hacky, but basically, if `is_alternation_literal` is
// true, then we can make several assumptions about the structure of our
// HIR. This is what justifies the `unreachable!` statements below.
if hirs.len() != 1
|| !info.props()[0].look_set().is_empty()
|| info.props()[0].explicit_captures_len() > 0
|| !info.props()[0].is_alternation_literal()
|| info.config().get_match_kind() != MatchKind::LeftmostFirst
{
return None;
}
let hir = &hirs[0];
let alts = match *hir.kind() {
HirKind::Alternation(ref alts) => alts,
_ => return None, // one literal isn't worth it
};
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
HirKind::Literal(Literal(ref bytes)) => {
lit.extend_from_slice(bytes)
}
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
HirKind::Literal(Literal(ref bytes)) => {
lit.extend_from_slice(bytes);
}
_ => unreachable!("expected literal, got {:?}", e),
}
}
}
_ => unreachable!("expected literal or concat, got {:?}", alt),
}
lits.push(lit);
}
// Why do this? Well, when the number of literals is small, it's likely
// that we'll use the lazy DFA which is in turn likely to be faster than
// Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
// a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
// the latter because it is so hungry (in time and space), and the former
// is decently fast, but not as fast as a well oiled lazy DFA.
//
// However, once the number starts getting large, the lazy DFA is likely
// to start thrashing because of the modest default cache size. When
// exactly does this happen? Dunno. But at whatever point that is (we make
// a guess below based on ad hoc benchmarking), we'll want to cut over to
// Aho-Corasick, where even the contiguous NFA is likely to do much better.
if lits.len() < 3000 {
debug!("skipping Aho-Corasick because there are too few literals");
return None;
}
Some(lits)
}