regex_automata/nfa/thompson/map.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
// This module contains a couple simple and purpose built hash maps. The key
// trade off they make is that they serve as caches rather than true maps. That
// is, inserting a new entry may cause eviction of another entry. This gives
// us two things. First, there's less overhead associated with inserts and
// lookups. Secondly, it lets us control our memory usage.
//
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
// Instead of exposing a rich hashmap entry API, we just permit the caller to
// produce a hash of the key directly. The hash can then be reused for both
// lookups and insertions at the cost of leaking abstraction a bit. But these
// are for internal use only, so it's fine.
//
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
// (almost) minimal DFA for large Unicode character classes in linear time.
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
// since there's a bit more expense in the reverse direction.)
//
// The Utf8SuffixMap is used when compiling large Unicode character classes for
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
// construction of UTF-8 automata by caching common suffixes. This doesn't
// get the same space savings as Daciuk's algorithm, but it's basically as
// fast as the naive approach and typically winds up using less memory (since
// it generates smaller NFAs) despite the presence of the cache.
//
// These maps effectively represent caching mechanisms for sparse and
// byte-range NFA states, respectively. The former represents a single NFA
// state with many transitions of equivalent priority while the latter
// represents a single NFA state with a single transition. (Neither state ever
// has or is an epsilon transition.) Thus, they have different key types. It's
// likely we could make one generic map, but the machinery didn't seem worth
// it. They are simple enough.
use alloc::{vec, vec::Vec};
use crate::{
nfa::thompson::Transition,
util::{
int::{Usize, U64},
primitives::StateID,
},
};
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
/// A bounded hash map where the key is a sequence of NFA transitions and the
/// value is a pre-existing NFA state ID.
///
/// std's hashmap can be used for this, however, this map has two important
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
/// control our memory usage by limited the number of slots. In general, the
/// cost here is that this map acts as a cache. That is, inserting a new entry
/// may remove an old entry. We are okay with this, since it does not impact
/// correctness in the cases where it is used. The only effect that dropping
/// states from the cache has is that the resulting NFA generated may be bigger
/// than it otherwise would be.
///
/// This improves benchmarks that compile large Unicode character classes,
/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
///
/// It is quite possible that there is a better way to approach this problem.
/// For example, if there happens to be a very common state that collides with
/// a lot of less frequent states, then we could wind up with very poor caching
/// behavior. Alas, the effectiveness of this cache has not been measured.
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
/// smarts (such as an LRU eviction policy) have to be weighed against the
/// amount of extra time they cost.
#[derive(Clone, Debug)]
pub struct Utf8BoundedMap {
/// The current version of this map. Only entries with matching versions
/// are considered during lookups. If an entry is found with a mismatched
/// version, then the map behaves as if the entry does not exist.
///
/// This makes it possible to clear the map by simply incrementing the
/// version number instead of actually deallocating any storage.
version: u16,
/// The total number of entries this map can store.
capacity: usize,
/// The actual entries, keyed by hash. Collisions between different states
/// result in the old state being dropped.
map: Vec<Utf8BoundedEntry>,
}
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8BoundedEntry {
/// The version of the map used to produce this entry. If this entry's
/// version does not match the current version of the map, then the map
/// should behave as if this entry does not exist.
version: u16,
/// The key, which is a sorted sequence of non-overlapping NFA transitions.
key: Vec<Transition>,
/// The state ID corresponding to the state containing the transitions in
/// this entry.
val: StateID,
}
impl Utf8BoundedMap {
/// Create a new bounded map with the given capacity. The map will never
/// grow beyond the given size.
///
/// Note that this does not allocate. Instead, callers must call `clear`
/// before using this map. `clear` will allocate space if necessary.
///
/// This avoids the need to pay for the allocation of this map when
/// compiling regexes that lack large Unicode character classes.
pub fn new(capacity: usize) -> Utf8BoundedMap {
assert!(capacity > 0);
Utf8BoundedMap { version: 0, capacity, map: vec![] }
}
/// Clear this map of all entries, but permit the reuse of allocation
/// if possible.
///
/// This must be called before the map can be used.
pub fn clear(&mut self) {
if self.map.is_empty() {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
} else {
self.version = self.version.wrapping_add(1);
// If we loop back to version 0, then we forcefully clear the
// entire map. Otherwise, it might be possible to incorrectly
// match entries used to generate other NFAs.
if self.version == 0 {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
}
}
}
/// Return a hash of the given transitions.
pub fn hash(&self, key: &[Transition]) -> usize {
let mut h = INIT;
for t in key {
h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
}
(h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
/// given must have been computed with `hash` using the same key value.
///
/// If there is no cached state with the given transitions, then None is
/// returned.
pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
let entry = &self.map[hash];
if entry.version != self.version {
return None;
}
// There may be a hash collision, so we need to confirm real equality.
if entry.key != key {
return None;
}
Some(entry.val)
}
/// Add a cached state to this map with the given key. Callers should
/// ensure that `state_id` points to a state that contains precisely the
/// NFA transitions given.
///
/// `hash` must have been computed using the `hash` method with the same
/// key.
pub fn set(
&mut self,
key: Vec<Transition>,
hash: usize,
state_id: StateID,
) {
self.map[hash] =
Utf8BoundedEntry { version: self.version, key, val: state_id };
}
}
/// A cache of suffixes used to modestly compress UTF-8 automata for large
/// Unicode character classes.
#[derive(Clone, Debug)]
pub struct Utf8SuffixMap {
/// The current version of this map. Only entries with matching versions
/// are considered during lookups. If an entry is found with a mismatched
/// version, then the map behaves as if the entry does not exist.
version: u16,
/// The total number of entries this map can store.
capacity: usize,
/// The actual entries, keyed by hash. Collisions between different states
/// result in the old state being dropped.
map: Vec<Utf8SuffixEntry>,
}
/// A key that uniquely identifies an NFA state. It is a triple that represents
/// a transition from one state for a particular byte range.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Utf8SuffixKey {
pub from: StateID,
pub start: u8,
pub end: u8,
}
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8SuffixEntry {
/// The version of the map used to produce this entry. If this entry's
/// version does not match the current version of the map, then the map
/// should behave as if this entry does not exist.
version: u16,
/// The key, which consists of a transition in a particular state.
key: Utf8SuffixKey,
/// The identifier that the transition in the key maps to.
val: StateID,
}
impl Utf8SuffixMap {
/// Create a new bounded map with the given capacity. The map will never
/// grow beyond the given size.
///
/// Note that this does not allocate. Instead, callers must call `clear`
/// before using this map. `clear` will allocate space if necessary.
///
/// This avoids the need to pay for the allocation of this map when
/// compiling regexes that lack large Unicode character classes.
pub fn new(capacity: usize) -> Utf8SuffixMap {
assert!(capacity > 0);
Utf8SuffixMap { version: 0, capacity, map: vec![] }
}
/// Clear this map of all entries, but permit the reuse of allocation
/// if possible.
///
/// This must be called before the map can be used.
pub fn clear(&mut self) {
if self.map.is_empty() {
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
} else {
self.version = self.version.wrapping_add(1);
if self.version == 0 {
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
}
}
}
/// Return a hash of the given transition.
pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
// Basic FNV-1a hash as described:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
let mut h = INIT;
h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
(h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
/// given must have been computed with `hash` using the same key value.
///
/// If there is no cached state with the given key, then None is returned.
pub fn get(
&mut self,
key: &Utf8SuffixKey,
hash: usize,
) -> Option<StateID> {
let entry = &self.map[hash];
if entry.version != self.version {
return None;
}
if key != &entry.key {
return None;
}
Some(entry.val)
}
/// Add a cached state to this map with the given key. Callers should
/// ensure that `state_id` points to a state that contains precisely the
/// NFA transition given.
///
/// `hash` must have been computed using the `hash` method with the same
/// key.
pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
self.map[hash] =
Utf8SuffixEntry { version: self.version, key, val: state_id };
}
}