// png/filter.rs

use core::convert::TryInto;

use crate::common::BytesPerPixel;

/// SIMD helpers for `fn unfilter`
///
/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
/// "unstable" feature of the `png` crate.  This should be possible once the "portable_simd"
/// feature of Rust gets stabilized.
///
/// This is only known to help on x86: on ARM most benchmarks show no change,
/// and some of them even regress severely.
/// So despite the code being portable, we only enable this for x86.
/// We can add more platforms once this code is proven to be beneficial for them.
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
mod simd {
    use std::simd::num::{SimdInt, SimdUint};
    use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};

    /// Scalar Paeth function wrapped in SIMD scaffolding.
    ///
    /// This is needed because simply running the function on the inputs
    /// makes the compiler think our inputs are too short
    /// to benefit from vectorization.
    /// Putting it in SIMD scaffolding fixes that.
    /// https://github.com/image-rs/image-png/issues/511
    ///
    /// Funnily, the autovectorizer does a better job here
    /// than a handwritten algorithm using std::simd!
    /// We used to have a handwritten one but this is just faster.
    fn paeth_predictor<const N: usize>(
        a: Simd<i16, N>,
        b: Simd<i16, N>,
        c: Simd<i16, N>,
    ) -> Simd<i16, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        let mut out = [0; N];
        for i in 0..N {
            out[i] = super::filter_paeth_stbi_i16(a[i].into(), b[i].into(), c[i].into());
        }
        out.into()
    }

    /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
    /// the SIMD elements to `i16`.
    fn paeth_predictor_u8<const N: usize>(
        a: Simd<u8, N>,
        b: Simd<u8, N>,
        c: Simd<u8, N>,
    ) -> Simd<u8, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        let mut out = [0; N];
        for i in 0..N {
            out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into());
        }
        out.into()
    }

    /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).
    /// See also https://www.w3.org/TR/png/#filter-byte-positions
    #[derive(Default)]
    struct PaethState<T, const N: usize>
    where
        T: SimdElement,
        LaneCount<N>: SupportedLaneCount,
    {
        /// Previous pixel in the previous row.
        c: Simd<T, N>,

        /// Previous pixel in the current row.
        a: Simd<T, N>,
    }

    /// Mutates `x` as needed to unfilter `FilterType::Paeth`.
    ///
    /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
    /// See also https://www.w3.org/TR/png/#filter-byte-positions
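    ///
    /// Pixel positions, using the PNG specification's names (`a` and `c` live in `state`):
    ///
    /// ```text
    /// c b
    /// a x
    /// ```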
    fn paeth_step<const N: usize>(
        state: &mut PaethState<i16, N>,
        b: Simd<u8, N>,
        x: &mut Simd<u8, N>,
    ) where
        LaneCount<N>: SupportedLaneCount,
    {
        // Storing the inputs.
        let b = b.cast::<i16>();

        // Calculating the new value of the current pixel.
        let predictor = paeth_predictor(state.a, b, state.c);
        *x += predictor.cast::<u8>();

        // Preparing for the next step.
        state.c = b;
        state.a = x.cast::<i16>();
    }

    /// Computes the Paeth predictor without converting `u8` to `i16`.
    ///
    /// See `simd::paeth_step`.
    fn paeth_step_u8<const N: usize>(
        state: &mut PaethState<u8, N>,
        b: Simd<u8, N>,
        x: &mut Simd<u8, N>,
    ) where
        LaneCount<N>: SupportedLaneCount,
    {
        // Calculating the new value of the current pixel.
        *x += paeth_predictor_u8(state.a, b, state.c);

        // Preparing for the next step.
        state.c = b;
        state.a = *x;
    }

    fn load3(src: &[u8]) -> u8x4 {
        u8x4::from_array([src[0], src[1], src[2], 0])
    }

    fn store3(src: u8x4, dest: &mut [u8]) {
        dest[0..3].copy_from_slice(&src.to_array()[0..3])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 3, 0);

        let mut state = PaethState::<i16, 4>::default();
        while prev_row.len() >= 4 {
            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
            // byte from the next triple.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x4::from_slice(prev_row);
            let mut x = u8x4::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 4 bytes might be more efficient (just as with using
            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
            // first byte of the next pixel in the `curr_row`.
            store3(x, curr_row);

            prev_row = &prev_row[3..];
            curr_row = &mut curr_row[3..];
        }
        // Can't use `u8x4::from_slice` for the last `[u8;3]`.
        let b = load3(prev_row);
        let mut x = load3(curr_row);
        paeth_step(&mut state, b, &mut x);
        store3(x, curr_row);
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`.
    ///
    /// This function calculates the Paeth predictor entirely in `Simd<u8, N>`
    /// without converting to an intermediate `Simd<i16, N>`. Doing so avoids
    /// paying a small performance penalty converting between types.
    pub fn unfilter_paeth_u8<const N: usize>(prev_row: &[u8], curr_row: &mut [u8])
    where
        LaneCount<N>: SupportedLaneCount,
    {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % N, 0);
        assert!(matches!(N, 4 | 8));

        let mut state = PaethState::<u8, N>::default();
        for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) {
            let b = Simd::from_slice(prev_row);
            let mut x = Simd::from_slice(curr_row);

            paeth_step_u8(&mut state, b, &mut x);

            curr_row[..N].copy_from_slice(&x.to_array()[..N]);
        }
    }

    fn load6(src: &[u8]) -> u8x8 {
        u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
    }

    fn store6(src: u8x8, dest: &mut [u8]) {
        dest[0..6].copy_from_slice(&src.to_array()[0..6])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 6, 0);

        let mut state = PaethState::<i16, 8>::default();
        while prev_row.len() >= 8 {
            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
            // bytes from the next pixel.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x8::from_slice(prev_row);
            let mut x = u8x8::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 8 bytes might be more efficient (just as with using
            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
            // first bytes of the next pixel in the `curr_row`.
            store6(x, curr_row);

            prev_row = &prev_row[6..];
            curr_row = &mut curr_row[6..];
        }
        // Can't use `u8x8::from_slice` for the last `[u8;6]`.
        let b = load6(prev_row);
        let mut x = load6(curr_row);
        paeth_step(&mut state, b, &mut x);
        store6(x, curr_row);
    }
}

/// The byte level filter applied to scanlines to prepare them for compression.
///
/// Compression in general benefits from repetitive data. The filter is a content-aware method of
/// compressing the range of occurring byte values to help the compression algorithm. Note that
/// this does not operate on pixels but on raw bytes of a scanline.
///
/// Details on how each filter works can be found in the [PNG Book](http://www.libpng.org/pub/png/book/chapter09.html).
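///
/// For example, with one byte per pixel the `Sub` filter replaces each byte with its difference
/// from the byte one pixel to the left, so a scanline of `[10, 20, 30, 40]` is stored as
/// `[10, 10, 10, 10]`, which compresses better.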
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FilterType {
    NoFilter = 0,
    Sub = 1,
    Up = 2,
    Avg = 3,
    Paeth = 4,
}

impl Default for FilterType {
    fn default() -> Self {
        FilterType::Sub
    }
}

impl FilterType {
    /// u8 -> Self. Temporary solution until Rust provides a canonical one.
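    ///
    /// For example, `FilterType::from_u8(4)` returns `Some(FilterType::Paeth)`, while any value
    /// above 4 (such as `FilterType::from_u8(5)`) returns `None`.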
    pub fn from_u8(n: u8) -> Option<FilterType> {
        match n {
            0 => Some(FilterType::NoFilter),
            1 => Some(FilterType::Sub),
            2 => Some(FilterType::Up),
            3 => Some(FilterType::Avg),
            4 => Some(FilterType::Paeth),
            _ => None,
        }
    }
}

/// Adaptive filtering tries every possible filter for each row and uses a heuristic to select the best one.
/// This improves compression ratio, but makes encoding slightly slower.
///
/// It is recommended to use `Adaptive` whenever you care about compression ratio.
/// Filtering is quite cheap compared to other parts of encoding, but can contribute
/// to the compression ratio significantly.
///
/// `NonAdaptive` filtering is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum AdaptiveFilterType {
    Adaptive,
    NonAdaptive,
}

impl Default for AdaptiveFilterType {
    fn default() -> Self {
        AdaptiveFilterType::NonAdaptive
    }
}

fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
    // On ARM this algorithm performs much better than the one below adapted from stb,
    // and this is the better-studied algorithm we've always used here,
    // so we default to it on all non-x86 platforms.
    let pa = (i16::from(b) - i16::from(c)).abs();
    let pb = (i16::from(a) - i16::from(c)).abs();
    let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();

    let mut out = a;
    let mut min = pa;

    if pb < min {
        min = pb;
        out = b;
    }
    if pc < min {
        out = c;
    }

    out
}

fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 {
    // Decoding optimizes better with this algorithm than with `filter_paeth`
    //
    // This formulation looks very different from the reference in the PNG spec, but is
    // actually equivalent and has favorable data dependencies and admits straightforward
    // generation of branch-free code, which helps performance significantly.
    //
    // Adapted from public domain PNG implementation:
    // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
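    //
    // As a quick worked check of the equivalence: for a = 50, b = 60, c = 40 the reference
    // predictor computes p = 70, pa = 20, pb = 10, pc = 30 and picks `b`; below, thresh = 10,
    // lo = 50, hi = 60, so t0 = c and t1 = hi = 60 = b as well.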
    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
    let lo = a.min(b);
    let hi = a.max(b);
    let t0 = if hi as i16 <= thresh { lo } else { c };
    let t1 = if thresh <= lo as i16 { hi } else { t0 };
    return t1;
}

#[cfg(any(test, all(feature = "unstable", target_arch = "x86_64")))]
fn filter_paeth_stbi_i16(a: i16, b: i16, c: i16) -> i16 {
    // Like `filter_paeth_stbi` but vectorizes better when wrapped in SIMD types.
    // Used for bpp=3 and bpp=6
    let thresh = c * 3 - (a + b);
    let lo = a.min(b);
    let hi = a.max(b);
    let t0 = if hi <= thresh { lo } else { c };
    let t1 = if thresh <= lo { hi } else { t0 };
    return t1;
}

fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
    // This is an optimized version of the paeth filter from the PNG specification, proposed by
    // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
    // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
    //
    //     p = a + b - c
    //     pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
    //     pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
    //     pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
    //
    // Further optimizing the calculation of `pc` is a bit trickier. However, notice that:
    //
    //        a > c && b > c
    //    ==> (a - c) > 0 && (b - c) > 0
    //    ==> pc > (a - c) && pc > (b - c)
    //    ==> pc > |a - c| && pc > |b - c|
    //    ==> pc > pb && pc > pa
    //
    // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
    // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= a` and
    // `c <= b` or vice versa:
    //
    //     pc = ||b - c| - |a - c|| =  |pa - pb| = max(pa, pb) - min(pa, pb)
    //
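    // A quick worked instance of the identity above: for a = 100, b = 150, c = 120 (so `c` lies
    // between `a` and `b`) we get pa = 30, pb = 20 and pc = |pa - pb| = 10, matching
    // |p - c| = |130 - 120| = 10 from the reference formulation; both pick `c`.
    //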
    let pa = b.max(c) - c.min(b);
    let pb = a.max(c) - c.min(a);
    let pc = if (a < c) == (c < b) {
        pa.max(pb) - pa.min(pb)
    } else {
        255
    };

    if pa <= pb && pa <= pc {
        a
    } else if pb <= pc {
        b
    } else {
        c
    }
}

pub(crate) fn unfilter(
    mut filter: FilterType,
    tbpp: BytesPerPixel,
    previous: &[u8],
    current: &mut [u8],
) {
    use self::FilterType::*;

    // If the previous row is empty, then treat it as if it were filled with zeros.
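    // With an all-zero previous row, the Paeth predictor always selects the pixel to the left
    // (both `b` and `c` are zero), which is exactly `Sub`, and `Up` adds zero to every byte,
    // which is exactly `NoFilter`.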
    if previous.is_empty() {
        if filter == Paeth {
            filter = Sub;
        } else if filter == Up {
            filter = NoFilter;
        }
    }

    // [2023/01 @okaneco] - Notes on optimizing decoding filters
    //
    // Links:
    // [PR]: https://github.com/image-rs/image-png/pull/382
    // [SWAR]: http://aggregate.org/SWAR/over.html
    // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
    //
    // #382 heavily refactored and optimized the following filters making the
    // implementation nonobvious. These comments function as a summary of that
    // PR with an explanation of the choices made below.
    //
    // #382 originally started with trying to optimize using a technique called
    // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
    // and `u64` as SIMD registers to perform vertical operations in parallel,
    // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
    // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
    // in a `u64`. The `Sub` filter looked like the following code block, `Avg`
    // was similar but used a bitwise average method from [AVG]:
    // ```
    // // See "Unpartitioned Operations With Correction Code" from [SWAR]
    // fn swar_add_u32(x: u32, y: u32) -> u32 {
    //     // 7-bit addition so there's no carry over the most significant bit
    //     let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
    //     // 1-bit parity/XOR addition to fill in the missing MSB
    //     n ^ (x ^ y) & 0x80808080                     // 0x80 = 0b_1000_0000
    // }
    //
    // let mut prev =
    //     u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
    // for chunk in current[4..].chunks_exact_mut(4) {
    //     let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    //     let new_chunk = swar_add_u32(cur, prev);
    //     chunk.copy_from_slice(&new_chunk.to_ne_bytes());
    //     prev = new_chunk;
    // }
    // ```
    // While this provided a measurable increase, @fintelia found that this idea
    // could be taken even further by unrolling the chunks component-wise and
    // avoiding unnecessary byte-shuffling by using byte arrays instead of
    // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
    // so they were reverted to their obvious arithmetic equivalent. Lastly,
    // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
    // looked like this (with asserts to remove `0..bpp` bounds checks):
    // ```
    // assert!(len > 3);
    // let mut prev = [current[0], current[1], current[2], current[3]];
    // for chunk in current[4..].chunks_exact_mut(4) {
    //     let new_chunk = [
    //         chunk[0].wrapping_add(prev[0]),
    //         chunk[1].wrapping_add(prev[1]),
    //         chunk[2].wrapping_add(prev[2]),
    //         chunk[3].wrapping_add(prev[3]),
    //     ];
    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
    //     prev = new_chunk;
    // }
    // ```
    // The compiler was able to optimize the code to be even faster and this
    // method even sped up Paeth filtering! Assertions were experimentally
    // added within loop bodies which produced better instructions but no
    // difference in speed. Finally, the code was refactored to remove manual
    // slicing and start the previous pixel chunks with arrays of `[0; N]`.
    // ```
    // let mut prev = [0; 4];
    // for chunk in current.chunks_exact_mut(4) {
    //     let new_chunk = [
    //         chunk[0].wrapping_add(prev[0]),
    //         chunk[1].wrapping_add(prev[1]),
    //         chunk[2].wrapping_add(prev[2]),
    //         chunk[3].wrapping_add(prev[3]),
    //     ];
    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
    //     prev = new_chunk;
    // }
    // ```
    // While we're not manually bit-twiddling anymore, a possible takeaway from
    // this is to "think in SWAR" when dealing with small byte arrays. Unrolling
    // array operations and performing them component-wise may unlock previously
    // unavailable optimizations from the compiler, even when using the
    // `chunks_exact` methods for their potential auto-vectorization benefits.
    match filter {
        NoFilter => {}
        Sub => match tbpp {
            BytesPerPixel::One => {
                current.iter_mut().reduce(|&mut prev, curr| {
                    *curr = curr.wrapping_add(prev);
                    curr
                });
            }
            BytesPerPixel::Two => {
                let mut prev = [0; 2];
                for chunk in current.chunks_exact_mut(2) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut prev = [0; 3];
                for chunk in current.chunks_exact_mut(3) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut prev = [0; 4];
                for chunk in current.chunks_exact_mut(4) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut prev = [0; 6];
                for chunk in current.chunks_exact_mut(6) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                        chunk[4].wrapping_add(prev[4]),
                        chunk[5].wrapping_add(prev[5]),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut prev = [0; 8];
                for chunk in current.chunks_exact_mut(8) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                        chunk[4].wrapping_add(prev[4]),
                        chunk[5].wrapping_add(prev[5]),
                        chunk[6].wrapping_add(prev[6]),
                        chunk[7].wrapping_add(prev[7]),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
        },
        Up => {
            for (curr, &above) in current.iter_mut().zip(previous) {
                *curr = curr.wrapping_add(above);
            }
        }
        Avg if previous.is_empty() => match tbpp {
            BytesPerPixel::One => {
                current.iter_mut().reduce(|&mut prev, curr| {
                    *curr = curr.wrapping_add(prev / 2);
                    curr
                });
            }
            BytesPerPixel::Two => {
                let mut prev = [0; 2];
                for chunk in current.chunks_exact_mut(2) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut prev = [0; 3];
                for chunk in current.chunks_exact_mut(3) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut prev = [0; 4];
                for chunk in current.chunks_exact_mut(4) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut prev = [0; 6];
                for chunk in current.chunks_exact_mut(6) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                        chunk[4].wrapping_add(prev[4] / 2),
                        chunk[5].wrapping_add(prev[5] / 2),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut prev = [0; 8];
                for chunk in current.chunks_exact_mut(8) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                        chunk[4].wrapping_add(prev[4] / 2),
                        chunk[5].wrapping_add(prev[5] / 2),
                        chunk[6].wrapping_add(prev[6] / 2),
                        chunk[7].wrapping_add(prev[7] / 2),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
        },
        Avg => match tbpp {
            BytesPerPixel::One => {
                let mut lprev = [0; 1];
                for (chunk, above) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) {
                    let new_chunk =
                        [chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8)];
                    *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Two => {
                let mut lprev = [0; 2];
                for (chunk, above) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut lprev = [0; 3];
                for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut lprev = [0; 4];
                for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut lprev = [0; 6];
                for (chunk, above) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                        chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
                        chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut lprev = [0; 8];
                for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                        chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
                        chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
                        chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
                        chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
        },
        #[allow(unreachable_code)]
        Paeth => {
            // Select the fastest Paeth filter implementation based on the target architecture.
            let filter_paeth_decode = if cfg!(target_arch = "x86_64") {
                filter_paeth_stbi
            } else {
                filter_paeth
            };

            // Paeth filter pixels:
            // C B D
            // A X
            match tbpp {
                BytesPerPixel::One => {
                    let mut a_bpp = [0; 1];
                    let mut c_bpp = [0; 1];
                    for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1))
                    {
                        let new_chunk = [chunk[0]
                            .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))];
                        *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Two => {
                    let mut a_bpp = [0; 2];
                    let mut c_bpp = [0; 2];
                    for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                        ];
                        *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Three => {
                    // Do not enable this algorithm on ARM, that would be a big performance hit
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth3(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 3];
                    let mut c_bpp = [0; 3];
                    for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                        ];
                        *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Four => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth_u8::<4>(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 4];
                    let mut c_bpp = [0; 4];
                    for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                        ];
                        *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Six => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth6(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 6];
                    let mut c_bpp = [0; 6];
                    for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                            chunk[4]
                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
                            chunk[5]
                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
                        ];
                        *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Eight => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth_u8::<8>(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 8];
                    let mut c_bpp = [0; 8];
                    for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                            chunk[4]
                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
                            chunk[5]
                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
                            chunk[6]
                                .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])),
                            chunk[7]
                                .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])),
                        ];
                        *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
            }
        }
    }
}

fn filter_internal(
    method: FilterType,
    bpp: usize,
    len: usize,
    previous: &[u8],
    current: &[u8],
    output: &mut [u8],
) -> FilterType {
    use self::FilterType::*;

    // This value was chosen experimentally based on what achieved the best performance. The
    // Rust compiler does auto-vectorization, and 32-bytes per loop iteration seems to enable
    // the fastest code when doing so.
    const CHUNK_SIZE: usize = 32;

    match method {
        NoFilter => {
            output.copy_from_slice(current);
            NoFilter
        }
        Sub => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);
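            // `current[bpp..]` and `current[..len - bpp]` are the same row shifted by one pixel,
            // so `prev[i]` below is the byte one pixel to the left of `cur[i]`.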

            for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(prev[i]);
                }
            }

            for ((out, cur), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub(prev);
            }

            output[..bpp].copy_from_slice(&current[..bpp]);
            Sub
        }
        Up => {
            let mut out_chunks = output.chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current.chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = previous.chunks_exact(CHUNK_SIZE);

            for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(prev[i]);
                }
            }

            for ((out, cur), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub(prev);
            }
            Up
        }
        Avg => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut cur_minus_bpp_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE);

            for (((out, cur), cur_minus_bpp), prev) in (&mut out_chunks)
                .zip(&mut cur_chunks)
                .zip(&mut cur_minus_bpp_chunks)
                .zip(&mut prev_chunks)
            {
                for i in 0..CHUNK_SIZE {
                    // Bitwise average of two integers without overflow and
                    // without converting to a wider bit-width. See:
                    // http://aggregate.org/MAGIC/#Average%20of%20Integers
                    // If this is unrolled by component, consider reverting to
                    // `((cur_minus_bpp[i] as u16 + prev[i] as u16) / 2) as u8`
                    out[i] = cur[i].wrapping_sub(
                        (cur_minus_bpp[i] & prev[i]) + ((cur_minus_bpp[i] ^ prev[i]) >> 1),
                    );
                }
            }

            for (((out, cur), &cur_minus_bpp), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(cur_minus_bpp_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub((cur_minus_bpp & prev) + ((cur_minus_bpp ^ prev) >> 1));
            }

            for i in 0..bpp {
                output[i] = current[i].wrapping_sub(previous[i] / 2);
            }
            Avg
        }
        Paeth => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut a_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);
            let mut b_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE);
            let mut c_chunks = previous[..len - bpp].chunks_exact(CHUNK_SIZE);
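            // Using the PNG specification's names: `a` is the pixel to the left, `b` the pixel
            // above, and `c` the pixel above and to the left of the byte being filtered.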

            for ((((out, cur), a), b), c) in (&mut out_chunks)
                .zip(&mut cur_chunks)
                .zip(&mut a_chunks)
                .zip(&mut b_chunks)
                .zip(&mut c_chunks)
            {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i]));
                }
            }

            for ((((out, cur), &a), &b), &c) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(a_chunks.remainder())
                .zip(b_chunks.remainder())
                .zip(c_chunks.remainder())
            {
                *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c));
            }

            for i in 0..bpp {
                output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0));
            }
            Paeth
        }
    }
}

pub(crate) fn filter(
    method: FilterType,
    adaptive: AdaptiveFilterType,
    bpp: BytesPerPixel,
    previous: &[u8],
    current: &[u8],
    output: &mut [u8],
) -> FilterType {
    use FilterType::*;
    let bpp = bpp.into_usize();
    let len = current.len();

    match adaptive {
        AdaptiveFilterType::NonAdaptive => {
            filter_internal(method, bpp, len, previous, current, output)
        }
        AdaptiveFilterType::Adaptive => {
            let mut min_sum: u64 = u64::MAX;
            let mut filter_choice = FilterType::NoFilter;
            for &filter in [Sub, Up, Avg, Paeth].iter() {
                filter_internal(filter, bpp, len, previous, current, output);
                let sum = sum_buffer(output);
                if sum <= min_sum {
                    min_sum = sum;
                    filter_choice = filter;
                }
            }

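            // `output` currently holds the Paeth-filtered row (the last filter tried above), so
            // the row only needs to be re-filtered when a different filter won.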
            if filter_choice != Paeth {
                filter_internal(filter_choice, bpp, len, previous, current, output);
            }
            filter_choice
        }
    }
}

// Helper function for Adaptive filter buffer summation
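// It scores a filtered row by summing the absolute values of its bytes interpreted as `i8`
// (a minimum-sum-of-absolute-differences style heuristic); the adaptive path above keeps the
// filter with the smallest sum. For example, a filtered row of `[1, 2, 255, 0]` scores
// 1 + 2 + 1 + 0 = 4, because 255 is read as -1.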
fn sum_buffer(buf: &[u8]) -> u64 {
    const CHUNK_SIZE: usize = 32;

    let mut buf_chunks = buf.chunks_exact(CHUNK_SIZE);
    let mut sum = 0_u64;

    for chunk in &mut buf_chunks {
        // At most, `acc` can be `32 * (i8::MIN as u8) = 32 * 128 = 4096`.
        let mut acc = 0;
        for &b in chunk {
            acc += u64::from((b as i8).unsigned_abs());
        }
        sum = sum.saturating_add(acc);
    }

    let mut acc = 0;
    for &b in buf_chunks.remainder() {
        acc += u64::from((b as i8).unsigned_abs());
    }

    sum.saturating_add(acc)
}

#[cfg(test)]
mod test {
    use super::*;
    use core::iter;

    #[test]
    fn roundtrip() {
        // A multiple of 8, 6, 4, 3, 2, 1
        const LEN: u8 = 240;
        let previous: Vec<_> = iter::repeat(1).take(LEN.into()).collect();
        let current: Vec<_> = (0..LEN).collect();
        let expected = current.clone();
        let adaptive = AdaptiveFilterType::NonAdaptive;

        let roundtrip = |kind, bpp: BytesPerPixel| {
            let mut output = vec![0; LEN.into()];
            filter(kind, adaptive, bpp, &previous, &current, &mut output);
            unfilter(kind, bpp, &previous, &mut output);
            assert_eq!(
                output, expected,
                "Filtering {:?} with {:?} does not roundtrip",
                bpp, kind
            );
        };

        let filters = [
            FilterType::NoFilter,
            FilterType::Sub,
            FilterType::Up,
            FilterType::Avg,
            FilterType::Paeth,
        ];

        let bpps = [
            BytesPerPixel::One,
            BytesPerPixel::Two,
            BytesPerPixel::Three,
            BytesPerPixel::Four,
            BytesPerPixel::Six,
            BytesPerPixel::Eight,
        ];

        for &filter in filters.iter() {
            for &bpp in bpps.iter() {
                roundtrip(filter, bpp);
            }
        }
    }

    #[test]
    #[ignore] // takes ~20s without optimizations
    fn paeth_impls_are_equivalent() {
        for a in 0..=255 {
            for b in 0..=255 {
                for c in 0..=255 {
                    let baseline = filter_paeth(a, b, c);
                    let fpnge = filter_paeth_fpnge(a, b, c);
                    let stbi = filter_paeth_stbi(a, b, c);
                    let stbi_i16 = filter_paeth_stbi_i16(a as i16, b as i16, c as i16);

                    assert_eq!(baseline, fpnge);
                    assert_eq!(baseline, stbi);
                    assert_eq!(baseline as i16, stbi_i16);
                }
            }
        }
    }

    #[test]
    fn roundtrip_ascending_previous_line() {
        // A multiple of 8, 6, 4, 3, 2, 1
        const LEN: u8 = 240;
        let previous: Vec<_> = (0..LEN).collect();
        let current: Vec<_> = (0..LEN).collect();
        let expected = current.clone();
        let adaptive = AdaptiveFilterType::NonAdaptive;

        let roundtrip = |kind, bpp: BytesPerPixel| {
            let mut output = vec![0; LEN.into()];
            filter(kind, adaptive, bpp, &previous, &current, &mut output);
            unfilter(kind, bpp, &previous, &mut output);
            assert_eq!(
                output, expected,
                "Filtering {:?} with {:?} does not roundtrip",
                bpp, kind
            );
        };

        let filters = [
            FilterType::NoFilter,
            FilterType::Sub,
            FilterType::Up,
            FilterType::Avg,
            FilterType::Paeth,
        ];

        let bpps = [
            BytesPerPixel::One,
            BytesPerPixel::Two,
            BytesPerPixel::Three,
            BytesPerPixel::Four,
            BytesPerPixel::Six,
            BytesPerPixel::Eight,
        ];

        for &filter in filters.iter() {
            for &bpp in bpps.iter() {
                roundtrip(filter, bpp);
            }
        }
    }

    #[test]
    // This tests that converting u8 to i8 doesn't overflow when taking the
    // absolute value for adaptive filtering: -128_i8.abs() will panic in debug
    // or produce garbage in release mode. The sum of 0..=255u8 should equal the
    // sum of the absolute values of -128_i8..=127, or abs(-128..=0) + 1..=127.
    fn sum_buffer_test() {
        let sum = (0..=128).sum::<u64>() + (1..=127).sum::<u64>();
        let buf: Vec<u8> = (0_u8..=255).collect();

        assert_eq!(sum, crate::filter::sum_buffer(&buf));
    }
}