regex_automata/meta/reverse_inner.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
/*!
A module dedicated to plucking inner literals out of a regex pattern, and
then constructing a prefilter for them. We also include a regex pattern
"prefix" that corresponds to the bits of the regex that need to match before
the literals do. The reverse inner optimization then proceeds by looking for
matches of the inner literal(s), and then doing a reverse search of the prefix
from the start of the literal match to find the overall start position of the
match.
The essential invariant we want to uphold here is that the literals we return
reflect a set where *at least* one of them must match in order for the overall
regex to match. We also need to maintain the invariant that the regex prefix
returned corresponds to the entirety of the regex up until the literals we
return.
This somewhat limits what we can do. That is, if we a regex like
`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically,
we could still extract `{@!, %%}`, and it is true that at least of them must
match. But then, what is our regex prefix? Again, in theory, that could be
`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
matches, and `\w+` only matches when `%%` matches.
All of that is technically possible to do, but it seemingly requires a lot of
sophistication and machinery. Probably the way to tackle that is with some kind
of formalism and approach this problem more generally.
For now, the code below basically just looks for a top-level concatenation.
And if it can find one, it looks for literals in each of the direct child
sub-expressions of that concatenation. If some good ones are found, we return
those and a concatenation of the Hir expressions seen up to that point.
*/
use alloc::vec::Vec;
use regex_syntax::hir::{self, literal, Hir, HirKind};
use crate::{util::prefilter::Prefilter, MatchKind};
/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
/// one was found, then a concatenation of the HIR expressions that precede it
/// is returned.
///
/// The idea here is that the prefilter returned can be used to find candidate
/// matches. And then the HIR returned can be used to build a reverse regex
/// matcher, which will find the start of the candidate match. Finally, the
/// match still has to be confirmed with a normal anchored forward scan to find
/// the end position of the match.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
if hirs.len() != 1 {
debug!(
"skipping reverse inner optimization since it only \
supports 1 pattern, {} were given",
hirs.len(),
);
return None;
}
let mut concat = match top_concat(hirs[0]) {
Some(concat) => concat,
None => {
debug!(
"skipping reverse inner optimization because a top-level \
concatenation could not found",
);
return None;
}
};
// We skip the first HIR because if it did have a prefix prefilter in it,
// we probably wouldn't be here looking for an inner prefilter.
for i in 1..concat.len() {
let hir = &concat[i];
let pre = match prefilter(hir) {
None => continue,
Some(pre) => pre,
};
// Even if we got a prefilter, if it isn't consider "fast," then we
// probably don't want to bother with it. Namely, since the reverse
// inner optimization requires some overhead, it likely only makes
// sense if the prefilter scan itself is (believed) to be much faster
// than the regex engine.
if !pre.is_fast() {
debug!(
"skipping extracted inner prefilter because \
it probably isn't fast"
);
continue;
}
let concat_suffix = Hir::concat(concat.split_off(i));
let concat_prefix = Hir::concat(concat);
// Look for a prefilter again. Why? Because above we only looked for
// a prefilter on the individual 'hir', but we might be able to find
// something better and more discriminatory by looking at the entire
// suffix. We don't do this above to avoid making this loop worst case
// quadratic in the length of 'concat'.
let pre2 = match prefilter(&concat_suffix) {
None => pre,
Some(pre2) => {
if pre2.is_fast() {
pre2
} else {
pre
}
}
};
return Some((concat_prefix, pre2));
}
debug!(
"skipping reverse inner optimization because a top-level \
sub-expression with a fast prefilter could not be found"
);
None
}
/// Attempt to extract a prefilter from an HIR expression.
///
/// We do a little massaging here to do our best that the prefilter we get out
/// of this is *probably* fast. Basically, the false positive rate has a much
/// higher impact for things like the reverse inner optimization because more
/// work needs to potentially be done for each candidate match.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
fn prefilter(hir: &Hir) -> Option<Prefilter> {
let mut extractor = literal::Extractor::new();
extractor.kind(literal::ExtractKind::Prefix);
let mut prefixes = extractor.extract(hir);
debug!(
"inner prefixes (len={:?}) extracted before optimization: {:?}",
prefixes.len(),
prefixes
);
// Since these are inner literals, we know they cannot be exact. But the
// extractor doesn't know this. We mark them as inexact because this might
// impact literal optimization. Namely, optimization weights "all literals
// are exact" as very high, because it presumes that any match results in
// an overall match. But of course, that is not the case here.
//
// In practice, this avoids plucking out a ASCII-only \s as an alternation
// of single-byte whitespace characters.
prefixes.make_inexact();
prefixes.optimize_for_prefix_by_preference();
debug!(
"inner prefixes (len={:?}) extracted after optimization: {:?}",
prefixes.len(),
prefixes
);
prefixes
.literals()
.and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
}
/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
/// try to return one even if it's embedded in a capturing group, but is
/// otherwise pretty conservative in what is returned.
///
/// The HIR returned is a complete copy of the concat with all capturing
/// groups removed. In effect, the concat returned is "flattened" with respect
/// to capturing groups. This makes the detection logic above for prefixes
/// a bit simpler, and it works because 1) capturing groups never influence
/// whether a match occurs or not and 2) capturing groups are not used when
/// doing the reverse inner search to find the start of the match.
fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
loop {
hir = match hir.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Look(_)
| HirKind::Repetition(_)
| HirKind::Alternation(_) => return None,
HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
HirKind::Concat(ref subs) => {
// We are careful to only do the flattening/copy when we know
// we have a "top level" concat we can inspect. This avoids
// doing extra work in cases where we definitely won't use it.
// (This might still be wasted work if we can't go on to find
// some literals to extract.)
let concat =
Hir::concat(subs.iter().map(|h| flatten(h)).collect());
return match concat.into_kind() {
HirKind::Concat(xs) => Some(xs),
// It is actually possible for this case to occur, because
// 'Hir::concat' might simplify the expression to the point
// that concatenations are actually removed. One wonders
// whether this leads to other cases where we should be
// extracting literals, but in theory, I believe if we do
// get here, then it means that a "real" prefilter failed
// to be extracted and we should probably leave well enough
// alone. (A "real" prefilter is unbothered by "top-level
// concats" and "capturing groups.")
_ => return None,
};
}
};
}
}
/// Returns a copy of the given HIR but with all capturing groups removed.
fn flatten(hir: &Hir) -> Hir {
match hir.kind() {
HirKind::Empty => Hir::empty(),
HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
HirKind::Class(ref x) => Hir::class(x.clone()),
HirKind::Look(ref x) => Hir::look(x.clone()),
HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
// This is the interesting case. We just drop the group information
// entirely and use the child HIR itself.
HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
HirKind::Alternation(ref xs) => {
Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
}
HirKind::Concat(ref xs) => {
Hir::concat(xs.iter().map(|x| flatten(x)).collect())
}
}
}