aho_corasick/packed/
pattern.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
use core::{cmp, fmt, mem, u16, usize};

use alloc::{boxed::Box, string::String, vec, vec::Vec};

use crate::{
    packed::{api::MatchKind, ext::Pointer},
    PatternID,
};

/// A non-empty collection of non-empty patterns to search for.
///
/// This collection of patterns is what is passed around to both execute
/// searches and to construct the searchers themselves. Namely, this permits
/// searches to avoid copying all of the patterns, and allows us to keep only
/// one copy throughout all packed searchers.
///
/// Note that this collection is not a set. The same pattern can appear more
/// than once.
#[derive(Clone, Debug)]
pub(crate) struct Patterns {
    /// The match semantics supported by this collection of patterns.
    ///
    /// The match semantics determines the order of the iterator over patterns.
    /// For leftmost-first, patterns are provided in the same order as were
    /// provided by the caller. For leftmost-longest, patterns are provided in
    /// descending order of length, with ties broken by the order in which they
    /// were provided by the caller.
    kind: MatchKind,
    /// The collection of patterns, indexed by their identifier.
    by_id: Vec<Vec<u8>>,
    /// The order of patterns defined for iteration, given by pattern
    /// identifiers. The order of `by_id` and `order` is always the same for
    /// leftmost-first semantics, but may be different for leftmost-longest
    /// semantics.
    order: Vec<PatternID>,
    /// The length of the smallest pattern, in bytes.
    minimum_len: usize,
    /// The total number of pattern bytes across the entire collection. This
    /// is used for reporting total heap usage in constant time.
    total_pattern_bytes: usize,
}

// BREADCRUMBS: I think we want to experiment with a different bucket
// representation. Basically, each bucket is just a Range<usize> to a single
// contiguous allocation? Maybe length-prefixed patterns or something? The
// idea is to try to get rid of the pointer chasing in verification. I don't
// know that that is the issue, but I suspect it is.

impl Patterns {
    /// Create a new collection of patterns for the given match semantics. The
    /// ID of each pattern is the index of the pattern at which it occurs in
    /// the `by_id` slice.
    ///
    /// If any of the patterns in the slice given are empty, then this panics.
    /// Similarly, if the number of patterns given is zero, then this also
    /// panics.
    pub(crate) fn new() -> Patterns {
        Patterns {
            kind: MatchKind::default(),
            by_id: vec![],
            order: vec![],
            minimum_len: usize::MAX,
            total_pattern_bytes: 0,
        }
    }

    /// Add a pattern to this collection.
    ///
    /// This panics if the pattern given is empty.
    pub(crate) fn add(&mut self, bytes: &[u8]) {
        assert!(!bytes.is_empty());
        assert!(self.by_id.len() <= u16::MAX as usize);

        let id = PatternID::new(self.by_id.len()).unwrap();
        self.order.push(id);
        self.by_id.push(bytes.to_vec());
        self.minimum_len = cmp::min(self.minimum_len, bytes.len());
        self.total_pattern_bytes += bytes.len();
    }

    /// Set the match kind semantics for this collection of patterns.
    ///
    /// If the kind is not set, then the default is leftmost-first.
    pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
        self.kind = kind;
        match self.kind {
            MatchKind::LeftmostFirst => {
                self.order.sort();
            }
            MatchKind::LeftmostLongest => {
                let (order, by_id) = (&mut self.order, &mut self.by_id);
                order.sort_by(|&id1, &id2| {
                    by_id[id1].len().cmp(&by_id[id2].len()).reverse()
                });
            }
        }
    }

    /// Return the number of patterns in this collection.
    ///
    /// This is guaranteed to be greater than zero.
    pub(crate) fn len(&self) -> usize {
        self.by_id.len()
    }

    /// Returns true if and only if this collection of patterns is empty.
    pub(crate) fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the approximate total amount of heap used by these patterns, in
    /// units of bytes.
    pub(crate) fn memory_usage(&self) -> usize {
        self.order.len() * mem::size_of::<PatternID>()
            + self.by_id.len() * mem::size_of::<Vec<u8>>()
            + self.total_pattern_bytes
    }

    /// Clears all heap memory associated with this collection of patterns and
    /// resets all state such that it is a valid empty collection.
    pub(crate) fn reset(&mut self) {
        self.kind = MatchKind::default();
        self.by_id.clear();
        self.order.clear();
        self.minimum_len = usize::MAX;
    }

    /// Returns the length, in bytes, of the smallest pattern.
    ///
    /// This is guaranteed to be at least one.
    pub(crate) fn minimum_len(&self) -> usize {
        self.minimum_len
    }

    /// Returns the match semantics used by these patterns.
    pub(crate) fn match_kind(&self) -> &MatchKind {
        &self.kind
    }

    /// Return the pattern with the given identifier. If such a pattern does
    /// not exist, then this panics.
    pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
        Pattern(&self.by_id[id])
    }

    /// Return the pattern with the given identifier without performing bounds
    /// checks.
    ///
    /// # Safety
    ///
    /// Callers must ensure that a pattern with the given identifier exists
    /// before using this method.
    pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
        Pattern(self.by_id.get_unchecked(id.as_usize()))
    }

    /// Return an iterator over all the patterns in this collection, in the
    /// order in which they should be matched.
    ///
    /// Specifically, in a naive multi-pattern matcher, the following is
    /// guaranteed to satisfy the match semantics of this collection of
    /// patterns:
    ///
    /// ```ignore
    /// for i in 0..haystack.len():
    ///   for p in patterns.iter():
    ///     if haystack[i..].starts_with(p.bytes()):
    ///       return Match(p.id(), i, i + p.bytes().len())
    /// ```
    ///
    /// Namely, among the patterns in a collection, if they are matched in
    /// the order provided by this iterator, then the result is guaranteed
    /// to satisfy the correct match semantics. (Either leftmost-first or
    /// leftmost-longest.)
    pub(crate) fn iter(&self) -> PatternIter<'_> {
        PatternIter { patterns: self, i: 0 }
    }
}

/// An iterator over the patterns in the `Patterns` collection.
///
/// The order of the patterns provided by this iterator is consistent with the
/// match semantics of the originating collection of patterns.
///
/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
/// this is iterating over.
#[derive(Debug)]
pub(crate) struct PatternIter<'p> {
    patterns: &'p Patterns,
    i: usize,
}

impl<'p> Iterator for PatternIter<'p> {
    type Item = (PatternID, Pattern<'p>);

    fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
        if self.i >= self.patterns.len() {
            return None;
        }
        let id = self.patterns.order[self.i];
        let p = self.patterns.get(id);
        self.i += 1;
        Some((id, p))
    }
}

/// A pattern that is used in packed searching.
#[derive(Clone)]
pub(crate) struct Pattern<'a>(&'a [u8]);

impl<'a> fmt::Debug for Pattern<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Pattern")
            .field("lit", &String::from_utf8_lossy(&self.0))
            .finish()
    }
}

impl<'p> Pattern<'p> {
    /// Returns the length of this pattern, in bytes.
    pub(crate) fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns the bytes of this pattern.
    pub(crate) fn bytes(&self) -> &[u8] {
        &self.0
    }

    /// Returns the first `len` low nybbles from this pattern. If this pattern
    /// is shorter than `len`, then this panics.
    pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
        let mut nybs = vec![0; len].into_boxed_slice();
        for (i, byte) in self.bytes().iter().take(len).enumerate() {
            nybs[i] = byte & 0xF;
        }
        nybs
    }

    /// Returns true if this pattern is a prefix of the given bytes.
    #[inline(always)]
    pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
        is_prefix(bytes, self.bytes())
    }

    /// Returns true if this pattern is a prefix of the haystack given by the
    /// raw `start` and `end` pointers.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn is_prefix_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> bool {
        let patlen = self.bytes().len();
        let haylen = end.distance(start);
        if patlen > haylen {
            return false;
        }
        // SAFETY: We've checked that the haystack has length at least equal
        // to this pattern. All other safety concerns are the responsibility
        // of the caller.
        is_equal_raw(start, self.bytes().as_ptr(), patlen)
    }
}

/// Returns true if and only if `needle` is a prefix of `haystack`.
///
/// This uses a latency optimized variant of `memcmp` internally which *might*
/// make this faster for very short strings.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
#[inline(always)]
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.len() > haystack.len() {
        return false;
    }
    // SAFETY: Our pointers are derived directly from borrowed slices which
    // uphold all of our safety guarantees except for length. We account for
    // length with the check above.
    unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
}

/// Compare corresponding bytes in `x` and `y` for equality.
///
/// That is, this returns true if and only if `x.len() == y.len()` and
/// `x[i] == y[i]` for all `0 <= i < x.len()`.
///
/// Note that this isn't used. We only use it in tests as a convenient way
/// of testing `is_equal_raw`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
#[cfg(test)]
#[inline(always)]
fn is_equal(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // SAFETY: Our pointers are derived directly from borrowed slices which
    // uphold all of our safety guarantees except for length. We account for
    // length with the check above.
    unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
}

/// Compare `n` bytes at the given pointers for equality.
///
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object. `x` and `y` do not need to point to the same allocated
/// object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
/// allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
/// for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // handle each possible length specially. Note that I used to have a
    // byte-at-a-time loop here and that turned out to be quite a bit slower
    // for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
    if n < 4 {
        return match n {
            0 => true,
            1 => x.read() == y.read(),
            2 => {
                x.cast::<u16>().read_unaligned()
                    == y.cast::<u16>().read_unaligned()
            }
            // I also tried copy_nonoverlapping here and it looks like the
            // codegen is the same.
            3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
            _ => unreachable!(),
        };
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
    // a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with tiny
    // needles. That means that if we do 8 byte loads, then a higher proportion
    // of memcmp calls will use the slower variant above. With that said, this
    // is a hypothesis and is only loosely supported by benchmarks. There's
    // likely some improvement that could be made here. The main thing here
    // though is to optimize for latency, not throughput.

    // SAFETY: The caller is responsible for ensuring the pointers we get are
    // valid and readable for at least `n` bytes. We also do unaligned loads,
    // so there's no need to ensure we're aligned. (This is justified by this
    // routine being specifically for short strings.)
    let xend = x.add(n.wrapping_sub(4));
    let yend = y.add(n.wrapping_sub(4));
    while x < xend {
        let vx = x.cast::<u32>().read_unaligned();
        let vy = y.cast::<u32>().read_unaligned();
        if vx != vy {
            return false;
        }
        x = x.add(4);
        y = y.add(4);
    }
    let vx = xend.cast::<u32>().read_unaligned();
    let vy = yend.cast::<u32>().read_unaligned();
    vx == vy
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equals_different_lengths() {
        assert!(!is_equal(b"", b"a"));
        assert!(!is_equal(b"a", b""));
        assert!(!is_equal(b"ab", b"a"));
        assert!(!is_equal(b"a", b"ab"));
    }

    #[test]
    fn equals_mismatch() {
        let one_mismatch = [
            (&b"a"[..], &b"x"[..]),
            (&b"ab"[..], &b"ax"[..]),
            (&b"abc"[..], &b"abx"[..]),
            (&b"abcd"[..], &b"abcx"[..]),
            (&b"abcde"[..], &b"abcdx"[..]),
            (&b"abcdef"[..], &b"abcdex"[..]),
            (&b"abcdefg"[..], &b"abcdefx"[..]),
            (&b"abcdefgh"[..], &b"abcdefgx"[..]),
            (&b"abcdefghi"[..], &b"abcdefghx"[..]),
            (&b"abcdefghij"[..], &b"abcdefghix"[..]),
            (&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
            (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
            (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
            (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
        ];
        for (x, y) in one_mismatch {
            assert_eq!(x.len(), y.len(), "lengths should match");
            assert!(!is_equal(x, y));
            assert!(!is_equal(y, x));
        }
    }

    #[test]
    fn equals_yes() {
        assert!(is_equal(b"", b""));
        assert!(is_equal(b"a", b"a"));
        assert!(is_equal(b"ab", b"ab"));
        assert!(is_equal(b"abc", b"abc"));
        assert!(is_equal(b"abcd", b"abcd"));
        assert!(is_equal(b"abcde", b"abcde"));
        assert!(is_equal(b"abcdef", b"abcdef"));
        assert!(is_equal(b"abcdefg", b"abcdefg"));
        assert!(is_equal(b"abcdefgh", b"abcdefgh"));
        assert!(is_equal(b"abcdefghi", b"abcdefghi"));
    }

    #[test]
    fn prefix() {
        assert!(is_prefix(b"", b""));
        assert!(is_prefix(b"a", b""));
        assert!(is_prefix(b"ab", b""));
        assert!(is_prefix(b"foo", b"foo"));
        assert!(is_prefix(b"foobar", b"foo"));

        assert!(!is_prefix(b"foo", b"fob"));
        assert!(!is_prefix(b"foobar", b"fob"));
    }
}