fast_srgb8/
lib.rs

Help
//! Small crate implementing fast conversion between linear float and 8-bit
//! sRGB.
//!
//! - [`f32_to_srgb8`]: Convert f32 to an sRGB u8. Meets all the requirements of
//!   [the most relevant public
//!   spec](https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#FLOATtoSRGB)
//!   which includes:
//!     - Maximum error of 0.6 ULP (on integer side) — Note that in practice
//!       this is a higher max error than the naive implementation will give
//!       you, so for applications like scientific or medical imaging, perhaps
//!       this is less acceptable. That said, for normal graphics work, this
//!       should be fine.
//!     - Monotonic across the 0.0..=1.0 range. (If `f32_to_srgb8(a) >
//!       f32_to_srgb8(b)`, then `a > b`)
//!     - All possible outputs are achievable (round-trips with
//!       [`srgb8_to_f32`]).
//!
//! - [`f32x4_to_srgb8`]: Produces results identical to calling [`f32_to_srgb8`]
//!   4 times in a row. On targets where we have a SIMD implementation
//!   (currently SSE2-enabled x86 and x86_64), this will use that. Otherwise, it
//!   will just call `f32_to_srgb8` four times in a row, and return the results.
//!
//! - [`srgb8_to_f32`]: Inverse operation of [`f32_to_srgb8`]. Uses the standard
//!   technique of a 256-item lookup table.
//!
//! ## Benefits
//! - Large performance improvements over the naive implementation (see
//!   [README.md](https://github.com/thomcc/fast-srgb8) for benchmarks)
//! - Supports `no_std` (normally this is tricky, as these operations require
//!   `powf` naively, which is not available to libcore)
//! - No dependencies.
//! - SIMD support for conversion to sRGB (conversion from sRGB is already ~20x
//!   faster than naive impl, and would probably be slower in SIMD, so for now
//!   it's not implemented).
//! - Consistent and correct (according to at least one relevant spec) handling
//!   of edge cases, such as NaN/Inf/etc.
//! - Exhaustive checking of all inputs for correctness (in tests).
38
39#![cfg_attr(not(test), no_std)]
40#![cfg_attr(all(test, unstable_bench), feature(test))]
41#[cfg(all(test, unstable_bench))]
42extern crate test;
43
44#[cfg(all(
45    not(miri),
46    any(target_arch = "x86_64", target_arch = "x86"),
47    target_feature = "sse2"
48))]
49mod sse2;
50
51/// Converts linear f32 RGB component to an 8-bit sRGB value.
52///
53/// If you have to do this for many values simultaneously, use
54/// [`f32x4_to_srgb8`], which will compute 4 results at once (using SIMD
55/// instructions if available).
56///
57/// Input less than 0.0, or greater than 1.0, is clamped to be inside that
58/// range. NaN input is treated as identical to 0.0.
59///
60/// # Details
61///
62/// Conceptually, this is an optimized (and slightly approximated — see the
63/// "Approximation" section below) version of the following "reference
64/// implementation", which more or less looks like:
65///
66/// ```
67/// // Conceptually equivalent (but see below)
68/// fn to_srgb_reference(f: f32) -> u8 {
69///     let v = if !(f > 0.0) {
70///         0.0
71///     } else if f <= 0.0031308 {
72///         12.92 * f
73///     } else if f < 1.0 {
74///         1.055 * f.powf(1.0 / 2.4) - 0.055
75///     } else {
76///         1.0
77///     };
78///     (v * 255.0 + 0.5) as u8
79/// }
80/// ```
81///
82/// This crate's implementation uses a small lookup table (a `[u32; 104]` --
83/// around 6.5 cache lines), and avoids needing to call `powf` (which, as an
84/// added bonus, means it works great in `no_std`), and in practice is many
85/// times faster than the alternative.
86///
87/// Additional, it's fairly amenable to implementing in SIMD (— everything is
88/// easily parallelized aside from the table lookup), and so a 4-wide
89/// implementation is also provided as [`f32x4_to_srgb8`]
90///
91/// ## Approximation
92/// Note that this is *not* bitwise identical to the results of the
93/// `to_srgb_reference` function above, it's just very close. The maximum error
94/// is 0.544403 for an input of 0.31152344, where error is computed as the
95/// absolute difference between the rounded integer and the "exact" value.
96///
97/// This almost certainly meets requirements for graphics: [The DirectX
98/// spec](https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#FLOATtoSRGB)
99/// mandates that compliant implementations of this function have a maximum
100/// error of less than "0.6 ULP on the integer side" — Ours is ~0.54, which is
101/// within the requirement.
102///
103/// This means function is probably at least as accurate as whatever your GPU
104/// driver and/or hardware does for sRGB framebuffers and such — very likely
105/// even if it isn't using DirectX (it's spec tends to be descriptive of what's
106/// available commonly, especially in cases like this (most cases) where it's
107/// the only one that bothers to put a requirement).
108///
109/// Additionally, because this function converts the result `u8` — for the vast
110/// majority of inputs it will return an identical result to the reference impl.
111///
112/// To be completely clear (since it was brought up as a concern): despite this
113/// approximation, this function and [`srgb8_to_f32`] are inverses of eachother,
114/// and round trip appropriately.
115#[inline]
116pub fn f32_to_srgb8(f: f32) -> u8 {
117    const MAXV_BITS: u32 = 0x3f7fffff; // 1.0 - f32::EPSILON
118    const MINV_BITS: u32 = 0x39000000; // 2^(-13)
119    let minv = f32::from_bits(MINV_BITS);
120    let maxv = f32::from_bits(MAXV_BITS);
121    // written like this to handle nans.
122    let mut input = f;
123    if !(input > minv) {
124        input = minv;
125    }
126    if input > maxv {
127        input = maxv;
128    }
129    let fu = input.to_bits();
130    #[cfg(all(not(unstable_bench), test))]
131    {
132        debug_assert!(MINV_BITS <= fu && fu <= MAXV_BITS);
133    }
134    // Safety: all input floats are clamped into the {minv, maxv} range, which
135    // turns out in this case to guarantee that their bitwise reprs are clamped
136    // to the {MINV_BITS, MAXV_BITS} range (guaranteed by the fact that
137    // minv/maxv are the normal, finite, the same sign, and not zero).
138    //
139    // Because of that, the smallest result of `fu - MINV_BITS` is 0 (when `fu`
140    // is `MINV_BITS`), and the largest is `0x067fffff`, (when `fu` is
141    // `MAXV_BITS`). `0x067fffff >> 20` is 0x67, e.g. 103, and thus all possible
142    // results are inbounds for the (104 item) table. This is all verified in
143    // test code.
144    //
145    // Note that the compiler can't figure this out on it's own, so the
146    // get_unchecked does help some.
147    let entry = unsafe {
148        let i = ((fu - MINV_BITS) >> 20) as usize;
149        #[cfg(all(not(unstable_bench), test))]
150        {
151            debug_assert!(TO_SRGB8_TABLE.get(i).is_some());
152        }
153        *TO_SRGB8_TABLE.get_unchecked(i)
154    };
155    // bottom 16 bits are bias, top 9 are scale.
156    let bias = (entry >> 16) << 9;
157    let scale = entry & 0xffff;
158
159    // lerp to the next highest mantissa bits.
160    let t = (fu >> 12) & 0xff;
161    let res = (bias + scale * t) >> 16;
162    #[cfg(all(not(unstable_bench), test))]
163    {
164        debug_assert!(res < 256, "{}", res);
165    }
166    res as u8
167}
168
169/// Performs 4 simultaneous calls to [`f32_to_srgb8`], and returns 4 results.
170///
171/// If available, this uses SIMD to perform all 4 computations simultaneously —
172/// currently this is just on x86_64 and x86 targets that suppost SSE2 (which in
173/// practice will be all x86_64 (aside from weird things like OS kernels), and
174/// all Rust targets beginning with `i686-`). On machines where it cannot use
175/// the CPU's vector instructions, this function simply performs 4 calls to
176/// [`f32_to_srgb8`].
177///
178/// The check for this support is performed at compile time, so it does no
179/// runtime SIMD feature checks. This seems like the right call for SSE2.
180///
181/// Behavior is otherwise exactly (bitwise) identical to [`f32_to_srgb8`], so see
182/// it's documentation for more information.
183#[inline]
184pub fn f32x4_to_srgb8(input: [f32; 4]) -> [u8; 4] {
185    #[cfg(all(
186        not(miri),
187        any(target_arch = "x86_64", target_arch = "x86"),
188        target_feature = "sse2"
189    ))]
190    unsafe {
191        // Safety: we've checked that we're on x86/x86_64 and have SSE2
192        crate::sse2::simd_to_srgb8(input)
193    }
194    #[cfg(not(all(
195        not(miri),
196        any(target_arch = "x86_64", target_arch = "x86"),
197        target_feature = "sse2"
198    )))]
199    {
200        [
201            f32_to_srgb8(input[0]),
202            f32_to_srgb8(input[1]),
203            f32_to_srgb8(input[2]),
204            f32_to_srgb8(input[3]),
205        ]
206    }
207}
208
/// Lookup table used by [`f32_to_srgb8`].
///
/// It is indexed by `(bits - 0x3900_0000) >> 20`, where `bits` is the bit
/// pattern of the clamped input — that is, by the input's exponent and top 3
/// mantissa bits, counted from 2^(-13). Each entry packs two values: the high
/// 16 bits are the bias and the low 16 bits are the scale that
/// [`f32_to_srgb8`] multiplies by the next 8 mantissa bits.
const TO_SRGB8_TABLE: [u32; 104] = [
    0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
    0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
    0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
    0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
    0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
    0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
    0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
    0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
    0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
    0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
    0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
    0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
    0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
];
224
/// Convert from a 8-bit sRGB component to a linear f32.
///
/// This is the inverse of [`f32_to_srgb8`] — and `c: u8` is roundtripped
/// through it, as shown below:
/// ```
/// use fast_srgb8::{f32_to_srgb8, srgb8_to_f32};
/// for c in 0..=255u8 {
///     // f32_to_srgb8(srgb8_to_f32(c)) is an identity operation
///     assert_eq!(f32_to_srgb8(srgb8_to_f32(c)), c);
/// }
/// ```
///
/// The implementation of this function isn't particularly clever — it just uses
/// a precomputed lookup table of all 256 results. That has a benefit in that it
/// allows this function to be a const fn, which is somewhat nice: generally
/// color constants hardcoded in source code are sRGB, and this means you can
/// use them to produce linear constants.
///
/// In practice this is way faster than the naive approach, and I'm unaware of
/// any faster ways of implementing it, but it's not really amenable to SIMD, so
/// no SIMD version is provided.
#[inline]
pub const fn srgb8_to_f32(c: u8) -> f32 {
    // A `u8` index can never be out of bounds for the 256-entry table.
    FROM_SRGB8_TABLE[c as usize]
}

/// Linear value for each of the 256 possible 8-bit sRGB inputs, indexed by the
/// sRGB byte. Used by [`srgb8_to_f32`].
#[rustfmt::skip]
const FROM_SRGB8_TABLE: [f32; 256] = [
    0.0, 0.000303527, 0.000607054, 0.00091058103, 0.001214108, 0.001517635, 0.0018211621, 0.002124689,
    0.002428216, 0.002731743, 0.00303527, 0.0033465356, 0.003676507, 0.004024717, 0.004391442,
    0.0047769533, 0.005181517, 0.0056053917, 0.0060488326, 0.006512091, 0.00699541, 0.0074990317,
    0.008023192, 0.008568125, 0.009134057, 0.009721218, 0.010329823, 0.010960094, 0.011612245,
    0.012286487, 0.012983031, 0.013702081, 0.014443844, 0.015208514, 0.015996292, 0.016807375,
    0.017641952, 0.018500218, 0.019382361, 0.020288562, 0.02121901, 0.022173883, 0.023153365,
    0.02415763, 0.025186857, 0.026241222, 0.027320892, 0.028426038, 0.029556843, 0.03071345, 0.03189604,
    0.033104774, 0.03433981, 0.035601325, 0.036889452, 0.038204376, 0.039546248, 0.04091521, 0.042311423,
    0.043735042, 0.045186214, 0.046665095, 0.048171833, 0.049706575, 0.051269468, 0.052860655, 0.05448028,
    0.056128494, 0.057805434, 0.05951124, 0.06124607, 0.06301003, 0.06480328, 0.06662595, 0.06847818,
    0.07036011, 0.07227186, 0.07421358, 0.07618539, 0.07818743, 0.08021983, 0.082282715, 0.084376216,
    0.086500466, 0.088655606, 0.09084173, 0.09305898, 0.095307484, 0.09758736, 0.09989874, 0.10224175,
    0.10461649, 0.10702311, 0.10946172, 0.111932434, 0.11443538, 0.116970696, 0.11953845, 0.12213881,
    0.12477186, 0.12743773, 0.13013652, 0.13286836, 0.13563336, 0.13843165, 0.14126332, 0.1441285,
    0.1470273, 0.14995982, 0.15292618, 0.1559265, 0.15896086, 0.16202943, 0.16513224, 0.16826946,
    0.17144115, 0.17464745, 0.17788847, 0.1811643, 0.18447503, 0.1878208, 0.19120172, 0.19461787,
    0.19806935, 0.2015563, 0.20507877, 0.2086369, 0.21223079, 0.21586053, 0.21952623, 0.22322798,
    0.22696589, 0.23074007, 0.23455065, 0.23839766, 0.2422812, 0.2462014, 0.25015837, 0.25415218,
    0.2581829, 0.26225072, 0.26635566, 0.27049786, 0.27467737, 0.27889434, 0.2831488, 0.2874409,
    0.2917707, 0.29613832, 0.30054384, 0.30498737, 0.30946895, 0.31398875, 0.31854683, 0.32314324,
    0.32777813, 0.33245158, 0.33716366, 0.34191445, 0.3467041, 0.3515327, 0.35640025, 0.36130688,
    0.3662527, 0.37123778, 0.37626222, 0.3813261, 0.38642952, 0.39157256, 0.3967553, 0.40197787,
    0.4072403, 0.4125427, 0.41788515, 0.42326775, 0.42869055, 0.4341537, 0.43965724, 0.44520125,
    0.45078585, 0.45641106, 0.46207705, 0.46778384, 0.47353154, 0.47932023, 0.48514998, 0.4910209,
    0.49693304, 0.5028866, 0.50888145, 0.5149178, 0.5209957, 0.52711535, 0.5332766, 0.5394797,
    0.5457247, 0.5520116, 0.5583406, 0.5647117, 0.57112503, 0.57758063, 0.5840786, 0.590619, 0.597202,
    0.60382754, 0.61049575, 0.61720675, 0.62396055, 0.63075733, 0.637597, 0.6444799, 0.6514058,
    0.65837497, 0.66538745, 0.67244333, 0.6795426, 0.68668544, 0.69387203, 0.70110214, 0.70837605,
    0.7156938, 0.72305536, 0.730461, 0.7379107, 0.7454045, 0.75294244, 0.76052475, 0.7681514, 0.77582246,
    0.78353804, 0.79129815, 0.79910296, 0.8069525, 0.8148468, 0.822786, 0.8307701, 0.83879924, 0.84687346,
    0.8549928, 0.8631574, 0.87136734, 0.8796226, 0.8879232, 0.89626956, 0.90466136, 0.913099, 0.92158204,
    0.93011117, 0.9386859, 0.9473069, 0.9559735, 0.9646866, 0.9734455, 0.98225087, 0.9911022, 1.0
];
286
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference sRGB-to-linear conversion computed with `powf`; used to
    /// validate `FROM_SRGB8_TABLE`.
    fn srgb8_to_f32_ref(c: u8) -> f32 {
        let c = c as f32 * (1.0 / 255.0);
        if c <= 0.04045 {
            c / 12.92
        } else {
            ((c + 0.055) / 1.055).powf(2.4)
        }
    }

    /// The lookup table must match the reference exactly, and every `u8` must
    /// round-trip through `srgb8_to_f32` followed by `f32_to_srgb8`.
    #[test]
    fn test_from_srgb8() {
        let wanted = (0..=255).map(srgb8_to_f32_ref).collect::<Vec<_>>();
        assert_eq!(&FROM_SRGB8_TABLE[..], &wanted[..]);
        for i in 0..=255u8 {
            assert_eq!(srgb8_to_f32(i), srgb8_to_f32_ref(i));
            assert_eq!(f32_to_srgb8(srgb8_to_f32(i)), i, "{}", i);
        }
    }

    // run as `cargo test --release -- --nocapture --ignored`
    #[test]
    #[ignore]
    fn test_exhaustive_scalar() {
        // Simultaneously test that:
        // - monotonicity is respected
        // - error < 0.6 ULP on int side
        // - SIMD and Scalar return identical values
        let mut prev = 0;
        for i in 0..=!0u32 {
            // Offset by the first NaN bit pattern (0x7f800001) so that we
            // iterate in a way that makes monotonicity easy to check.
            let f = f32::from_bits(i.wrapping_add((255 << 23) + 1));
            let c = f32_to_srgb8(f);
            let reference = unrounded_f32_to_srgb_ref(f);
            let err = (c as f32 - reference).abs();
            assert!(
                err < 0.6,
                "Error exceeds limit, {} >= 0.6 at {:?} (0x{:08x})",
                err,
                f,
                f.to_bits(),
            );
            assert!(
                c >= prev,
                "Monotonicity not respected {} < {} at  {:?} (0x{:08x})",
                c,
                prev,
                f,
                f.to_bits(),
            );
            prev = c;
            let v = f32x4_to_srgb8([f, f, f, f]);
            assert_eq!([c, c, c, c], v);
            // Progress report once every 2^24 inputs.
            if (i & 0xffffff) == 0 {
                println!("scalar: {}", i >> 24);
            }
        }
    }

    /// Feeds every f32 bit pattern through the 4-wide entry point, four
    /// consecutive patterns at a time, and compares against the scalar path.
    #[test]
    #[ignore]
    fn test_exhaustive_simd() {
        // verifies exactly identical results for all inputs.
        let mut i = 0;
        loop {
            let f0 = f32::from_bits(i);
            let f1 = f32::from_bits(i + 1);
            let f2 = f32::from_bits(i + 2);
            let f3 = f32::from_bits(i + 3);
            let v = f32x4_to_srgb8([f0, f1, f2, f3]);
            let c0 = f32_to_srgb8(f0);
            let c1 = f32_to_srgb8(f1);
            let c2 = f32_to_srgb8(f2);
            let c3 = f32_to_srgb8(f3);
            assert_eq!(
                v,
                [c0, c1, c2, c3],
                "simd/scalar mismatch at {:?} (starting at 0x{:08x})",
                [f0, f1, f2, f3],
                i,
            );
            // Progress report once every 2^24 inputs.
            if (i & 0xffffff) == 0 {
                println!("simd: {}", i >> 24);
            }
            // `i` advances in steps of 4 from 0, so it wraps back to exactly 0
            // once the whole u32 range has been covered.
            i = i.wrapping_add(4);
            if i == 0 {
                break;
            }
        }
    }

    /// Reference linear-to-sRGB conversion scaled to 0.0..=255.0 but *not*
    /// rounded, so the error of the fast path can be measured against it.
    fn unrounded_f32_to_srgb_ref(f: f32) -> f32 {
        let v = if !(f > 0.0) {
            0.0
        } else if f <= 0.0031308 {
            12.92 * f
        } else if f < 1.0 {
            1.055 * f.powf(1.0 / 2.4) - 0.055
        } else {
            1.0
        };
        v * 255.0
    }

    /// Benchmarks comparing the fast paths with the `powf`-based references.
    /// Only compiled when the `unstable_bench` cfg is set.
    #[cfg(unstable_bench)]
    mod bench {
        use super::*;

        /// Rounded reference conversion (the "naive" implementation).
        fn f32_to_srgb_ref(f: f32) -> u8 {
            (unrounded_f32_to_srgb_ref(f) + 0.5) as u8
        }

        // Each benchmark iteration converts `BENCH_SUBDIV + 1` evenly spaced
        // inputs covering 0.0..=1.0.
        const BENCH_SUBDIV: usize = 50;

        #[bench]
        fn fast_scalar(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=BENCH_SUBDIV {
                    test::black_box(f32_to_srgb8(i as f32 / BENCH_SUBDIV as f32));
                }
            });
        }
        #[bench]
        fn naive_scalar(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=BENCH_SUBDIV {
                    test::black_box(f32_to_srgb_ref(i as f32 / BENCH_SUBDIV as f32));
                }
            });
        }
        #[bench]
        fn naive_f32x4(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=BENCH_SUBDIV {
                    let a = f32_to_srgb_ref(i as f32 / BENCH_SUBDIV as f32);
                    let b = f32_to_srgb_ref(i as f32 / BENCH_SUBDIV as f32 + 0.025);
                    let c = f32_to_srgb_ref(i as f32 / BENCH_SUBDIV as f32 + 0.05);
                    let d = f32_to_srgb_ref(i as f32 / BENCH_SUBDIV as f32 + 0.075);
                    test::black_box([a, b, c, d]);
                }
            });
        }
        #[bench]
        fn fast_f32x4(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=BENCH_SUBDIV {
                    let v = f32x4_to_srgb8([
                        i as f32 / BENCH_SUBDIV as f32,
                        i as f32 / BENCH_SUBDIV as f32 + 0.025,
                        i as f32 / BENCH_SUBDIV as f32 + 0.05,
                        i as f32 / BENCH_SUBDIV as f32 + 0.075,
                    ]);
                    test::black_box(v);
                }
            });
        }
        #[bench]
        fn fast_f32x4_nosimd(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=BENCH_SUBDIV {
                    let a = f32_to_srgb8(i as f32 / BENCH_SUBDIV as f32);
                    let b = f32_to_srgb8(i as f32 / BENCH_SUBDIV as f32 + 0.025);
                    let c = f32_to_srgb8(i as f32 / BENCH_SUBDIV as f32 + 0.05);
                    let d = f32_to_srgb8(i as f32 / BENCH_SUBDIV as f32 + 0.075);
                    test::black_box([a, b, c, d]);
                }
            });
        }

        #[bench]
        fn naive_from_srgb8(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=255 {
                    test::black_box(srgb8_to_f32_ref(i));
                }
            });
        }
        #[bench]
        fn fast_from_srgb8(b: &mut test::Bencher) {
            b.iter(|| {
                for i in 0..=255 {
                    test::black_box(srgb8_to_f32(i));
                }
            });
        }
    }
}
fast_srgb8/lib.rs

fast_srgb8/
lib.rs