tiny_skia/pipeline/lowp.rs

// Copyright 2018 Google Inc.
// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/*!
A low-precision raster pipeline implementation.

A lowp pipeline uses u16 instead of f32 for math.
Because of that, it doesn't implement stages that require high precision.
The pipeline compiler will automatically decide which one to use.

Skia uses u16x8 (128-bit) types for a generic CPU and u16x16 (256-bit) for modern x86 CPUs.
But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions.
And since they are unavailable in Rust, we have to do everything manually.

According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia.
Not sure why. For example, there is no div instruction for u16x8, so we have to use
a basic scalar version, which means unnecessary loads/stores. No idea what clang does in this case.
Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why.

Therefore we are using scalar u16x16 by default and relying on rustc/llvm auto-vectorization instead.
When targeting a generic CPU, we're just 5-10% slower than Skia, while u16x8 is 30-40% slower.
And while `-C target-cpu=haswell` boosts our performance by around 25%,
we are still 40-60% behind Skia built for Haswell.

On ARM AArch64 the story is different and explicit SIMD makes our code up to 2-3x faster.
*/

use crate::PremultipliedColorU8;

use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, u16x16, f32x16};
use crate::geom::ScreenIntRect;

pub const STAGE_WIDTH: usize = 16;

pub type StageFn = fn(p: &mut Pipeline);

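// `r`/`g`/`b`/`a` hold the current (source) color and `dr`/`dg`/`db`/`da` the
// destination color, with one u16 lane per pixel. `dx`/`dy` point at the first
// pixel of the current batch and `tail` is the number of valid pixels in it
// (1..=STAGE_WIDTH).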
pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap: &'a mut SubPixmapMut<'b>,
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    ctx: &'a mut super::Context,
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
    tail: usize,
    dx: usize,
    dy: usize,
}

impl Pipeline<'_, '_> {
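    // Each stage does its own work and then calls `next_stage`, which advances
    // `index` and invokes the next function pointer in `functions`. A pipeline
    // is terminated by `just_return`, which simply doesn't call `next_stage`.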
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}


// Must be in the same order as raster_pipeline::Stage
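// `null_fn` entries are stages that are only implemented by the high-precision pipeline.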
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn, // Clamp0
    null_fn, // ClampA
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn, // Gather
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn, // ColorBurn
    null_fn, // ColorDodge
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn, // SoftLight
    null_fn, // Hue
    null_fn, // Saturation
    null_fn, // Color
    null_fn, // Luminosity
    source_over_rgba,
    transform,
    null_fn, // Reflect
    null_fn, // Repeat
    null_fn, // Bilinear
    null_fn, // Bicubic
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_radius,
    null_fn, // XYTo2PtConicalFocalOnCircle
    null_fn, // XYTo2PtConicalWellBehaved
    null_fn, // XYTo2PtConicalGreater
    null_fn, // Mask2PtConicalDegenerates
    null_fn, // ApplyVectorMask
];

pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}

#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

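    // Process each row in full 16-pixel batches using `functions`, then handle
    // the remaining 1..=15 pixels of the row with `functions_tail`.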
    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}

fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

fn premultiply(p: &mut Pipeline) {
    p.r = div255(p.r * p.a);
    p.g = div255(p.g * p.a);
    p.b = div255(p.b * p.a);

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = p.ctx.uniform_color;
    p.r = u16x16::splat(ctx.rgba[0]);
    p.g = u16x16::splat(ctx.rgba[1]);
    p.b = u16x16::splat(ctx.rgba[2]);
    p.a = u16x16::splat(ctx.rgba[3]);

    p.next_stage();
}

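// Seeds the pipeline with pixel-center coordinates: `iota` offsets every lane
// by 0.5, so `x` covers the 16 pixel centers of the current batch and `y` is
// the current row center. The f32 coordinates are bit-packed into the u16
// registers via `split` and recovered later with `join`.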
fn seed_shader(p: &mut Pipeline) {
    let iota = f32x16(
        f32x8::from([0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5]),
        f32x8::from([8.5,  9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
    );

    let x = f32x16::splat(p.dx as f32) + iota;
    let y = f32x16::splat(p.dy as f32 + 0.5);
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}

pub fn load_dst_u8_tail(p: &mut Pipeline) {
    // Fill a dummy array with `tail` values. `tail` is always in the 1..=STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8` function and remove any branches.
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let mut tmp = [0u8; STAGE_WIDTH];
    tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
    load_8(&tmp, &mut p.da);

    p.next_stage();
}

pub fn store_u8(p: &mut Pipeline) {
    let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    data[ 0] = a[ 0] as u8;
    data[ 1] = a[ 1] as u8;
    data[ 2] = a[ 2] as u8;
    data[ 3] = a[ 3] as u8;
    data[ 4] = a[ 4] as u8;
    data[ 5] = a[ 5] as u8;
    data[ 6] = a[ 6] as u8;
    data[ 7] = a[ 7] as u8;
    data[ 8] = a[ 8] as u8;
    data[ 9] = a[ 9] as u8;
    data[10] = a[10] as u8;
    data[11] = a[11] as u8;
    data[12] = a[12] as u8;
    data[13] = a[13] as u8;
    data[14] = a[14] as u8;
    data[15] = a[15] as u8;

    p.next_stage();
}

pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have at most 16 steps and the slice accesses are guaranteed
    // to be valid. This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}

// Similar to mask_u8, but only loads the mask values without actually masking the pipeline.
fn load_mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    p.r = u16x16::splat(0);
    p.g = u16x16::splat(0);
    p.b = u16x16::splat(0);
    p.a = c;

    p.next_stage();
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

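    // An all-zero mask would zero out every channel anyway, so we skip the rest
    // of the pipeline for this batch by returning without calling `next_stage`.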
    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn scale_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

fn scale_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

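// Each blend closure receives (source, destination, source alpha, destination alpha),
// all premultiplied and in the 0..=255 range. `inv(x)` stands for `255 - x` and
// `div255` renormalizes the result of a fixed-point multiplication.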
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

blend_fn!(clear,            |_, _,  _,  _| u16x16::splat(0));
blend_fn!(source_atop,      |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in,        |s, _,  _, da| div255(s * da));
blend_fn!(destination_in,   |_, d, sa,  _| div255(d * sa));
blend_fn!(source_out,       |s, _,  _, da| div255(s * inv(da)));
blend_fn!(destination_out,  |_, d, sa,  _| div255(d * inv(sa)));
blend_fn!(source_over,      |s, d, sa,  _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d,  _, da| d + div255(s * inv(da)));
blend_fn!(modulate,         |s, d,  _,  _| div255(s * d));
blend_fn!(multiply,         |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen,           |s, d,  _,  _| s + d - div255(s * d));
blend_fn!(xor,              |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

// For some reason, this closure needs an explicit argument type.
blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));


macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // `$f` is applied to the color channels, while the alpha channel uses source_over.
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}

blend_fn2!(darken,      |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten,     |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion,   |s: u16x16, d,  _,  _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference,  |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

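// Hard-light selects between a multiply-like and a screen-like term per channel:
// the `(s+s).cmp_le(&sa)` mask picks `2*s*d` where 2*s <= sa and
// `sa*da - 2*(sa-s)*(da-d)` otherwise. Overlay below is the same formula with
// the comparison done on the destination instead of the source.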
blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s+s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d+d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}

fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}

fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
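    // Each per-lane comparison below adds 1, so after the loop `idx` holds, for
    // every pixel, the number of stops whose position is <= t, i.e. the segment
    // index that `gradient_lookup` should sample.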
    let t = join(&p.r, &p.g);
    let mut idx = u16x16::splat(0);
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let t0: [f32; 8] = t.0.into();
        let t1: [f32; 8] = t.1.into();
        idx.0[ 0] += (t0[0] >= tt) as u16;
        idx.0[ 1] += (t0[1] >= tt) as u16;
        idx.0[ 2] += (t0[2] >= tt) as u16;
        idx.0[ 3] += (t0[3] >= tt) as u16;
        idx.0[ 4] += (t0[4] >= tt) as u16;
        idx.0[ 5] += (t0[5] >= tt) as u16;
        idx.0[ 6] += (t0[6] >= tt) as u16;
        idx.0[ 7] += (t0[7] >= tt) as u16;
        idx.0[ 8] += (t1[0] >= tt) as u16;
        idx.0[ 9] += (t1[1] >= tt) as u16;
        idx.0[10] += (t1[2] >= tt) as u16;
        idx.0[11] += (t1[3] >= tt) as u16;
        idx.0[12] += (t1[4] >= tt) as u16;
        idx.0[13] += (t1[5] >= tt) as u16;
        idx.0[14] += (t1[6] >= tt) as u16;
        idx.0[15] += (t1[7] >= tt) as u16;
    }
    gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x*x + y*y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

// We are using u16 for the index, not u32 like Skia does, to simplify the code a bit.
// The gradient creation code will not allow that many stops anyway.
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            // Surprisingly, bounds checking doesn't affect performance here.
            // And since `idx` may contain out-of-range values, we should leave it in place.
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}

#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // TODO: may produce a slightly different result to Skia
    //       affects the two_stops_linear_mirror test

    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}

pub fn just_return(_: &mut Pipeline) {
    // Ends the loop.
}

pub fn null_fn(_: &mut Pipeline) {
    // Just for unsupported functions in STAGES.
}

#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    *r = u16x16([
        data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
        data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
        data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
        data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
    ]);

    *g = u16x16([
        data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
        data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
        data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
        data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
    ]);

    *b = u16x16([
        data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
        data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
        data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
        data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
    ]);

    *a = u16x16([
        data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
        data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
        data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
        data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // Fill a dummy array with `tail` values. `tail` is always in the 1..=STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8888` function and remove any branches.
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}

#[inline(always)]
fn store_8888(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
}

#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have at most 16 steps and the slice accesses are guaranteed
    // to be valid. This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}

#[inline(always)]
fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
    *a = u16x16([
        data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
        data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
        data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
        data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
    ]);
}

#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available,
    // but it doesn't affect performance much and breaks reproducible results. Ignore it.
    // NOTE: the compiler does not replace the division with a shift, so we spell out the shift.
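    // `(v + 255) >> 8` approximates `v / 255`: e.g. 255 * 255 = 65025 maps to
    // (65025 + 255) >> 8 = 255, and 0 stays 0, so it is exact at both ends.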
    (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256)
}

#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}

#[inline(always)]
fn from_float(f: f32) -> u16x16 {
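    // Maps coverage in 0.0..=1.0 to 0..=255 with rounding, e.g. 1.0 -> 255 and 0.5 -> 128.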
    u16x16::splat((f * 255.0 + 0.5) as u16)
}

#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
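    // A classic fixed-point lerp: t == 0 keeps `from`, t == 255 yields `to`.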
    div255(from * inv(t) + to * t)
}

#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    // We're splitting f32x16 (512 bit) into two u16x16 (256 bit).
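    // This is a plain byte copy (a bitcast), not a numeric conversion:
    // each f32 lane ends up spread across two u16 lanes.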
    let data: [u8; 64] = bytemuck::cast(*v);
    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}

#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    // We're joining two u16x16 (256 bit) into f32x16 (512 bit).

    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}

#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it.
    f * m + a
}