tiny_skia/pipeline/highp.rs

// Copyright 2018 Google Inc.
// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/*!
A high precision raster pipeline implementation.

Unlike the lowp pipeline, this one implements all stages.

Just like Skia, this pipeline is implemented using f32x8.

For some reason, we are almost 2x slower than Skia. Maybe because Skia uses
clang's vector extensions while we rely on a manual implementation.
*/

use crate::{PremultipliedColorU8, SpreadMode, PixmapRef};

use crate::geom::ScreenIntRect;
use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, i32x8, u32x8};

pub const STAGE_WIDTH: usize = 8;

pub type StageFn = fn(p: &mut Pipeline);

pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap_src: PixmapRef<'a>,
    pixmap_dst: &'a mut SubPixmapMut<'b>,
    ctx: &'a mut super::Context, // TODO: remove mut
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    r: f32x8,
    g: f32x8,
    b: f32x8,
    a: f32x8,
    dr: f32x8,
    dg: f32x8,
    db: f32x8,
    da: f32x8,
    tail: usize,
    dx: usize,
    dy: usize,
}

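// Each stage processes up to eight pixels at once: the source color lives in
// the r/g/b/a registers and the destination color in dr/dg/db/da. A stage does
// its work and then tail-calls the next function in `functions`; the chain is
// terminated by the `just_return` stage, which simply doesn't continue it.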
impl Pipeline<'_, '_> {
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}

// Must be in the same order as raster_pipeline::Stage
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    clamp_0,
    clamp_a,
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    gather,
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    color_burn,
    color_dodge,
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    soft_light,
    hue,
    saturation,
    color,
    luminosity,
    source_over_rgba,
    transform,
    reflect,
    repeat,
    bilinear,
    bicubic,
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_radius,
    xy_to_2pt_conical_focal_on_circle,
    xy_to_2pt_conical_well_behaved,
    xy_to_2pt_conical_greater,
    mask_2pt_conical_degenerates,
    apply_vector_mask,
];

pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap_src: PixmapRef,
    pixmap_dst: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap_src,
        pixmap_dst,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: f32x8::default(),
        g: f32x8::default(),
        b: f32x8::default(),
        a: f32x8::default(),
        dr: f32x8::default(),
        dg: f32x8::default(),
        db: f32x8::default(),
        da: f32x8::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}
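
// For example, a 20-pixel-wide row is processed as two full 8-pixel batches
// through `functions`, followed by a single 4-pixel batch through
// `functions_tail` with `p.tail = 4`.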

fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn premultiply(p: &mut Pipeline) {
    p.r *= p.a;
    p.g *= p.a;
    p.b *= p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

fn clamp_0(p: &mut Pipeline) {
    p.r = p.r.max(f32x8::default());
    p.g = p.g.max(f32x8::default());
    p.b = p.b.max(f32x8::default());
    p.a = p.a.max(f32x8::default());

    p.next_stage();
}

fn clamp_a(p: &mut Pipeline) {
    p.r = p.r.min(f32x8::splat(1.0));
    p.g = p.g.min(f32x8::splat(1.0));
    p.b = p.b.min(f32x8::splat(1.0));
    p.a = p.a.min(f32x8::splat(1.0));

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = &p.ctx.uniform_color;
    p.r = f32x8::splat(ctx.r);
    p.g = f32x8::splat(ctx.g);
    p.b = f32x8::splat(ctx.b);
    p.a = f32x8::splat(ctx.a);

    p.next_stage();
}

fn seed_shader(p: &mut Pipeline) {
    let iota = f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]);

    p.r = f32x8::splat(p.dx as f32) + iota;
    p.g = f32x8::splat(p.dy as f32 + 0.5);
    p.b = f32x8::splat(1.0);
    p.a = f32x8::default();

    p.dr = f32x8::default();
    p.dg = f32x8::default();
    p.db = f32x8::default();
    p.da = f32x8::default();

    p.next_stage();
}

pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap_dst.slice4_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap_dst.slice4_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

// Currently, all mask/A8 pixmaps are handled by lowp.
pub fn load_dst_u8(_: &mut Pipeline) {
    // unreachable
}

pub fn load_dst_u8_tail(_: &mut Pipeline) {
    // unreachable
}

pub fn store_u8(_: &mut Pipeline) {
    // unreachable
}

pub fn store_u8_tail(_: &mut Pipeline) {
    // unreachable
}

pub fn gather(p: &mut Pipeline) {
    let ix = gather_ix(p.pixmap_src, p.r, p.g);
    load_8888(&p.pixmap_src.gather(ix), &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

#[inline(always)]
fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 {
    // Exclusive -> inclusive.
    let w = ulp_sub(pixmap.width() as f32);
    let h = ulp_sub(pixmap.height() as f32);
    x = x.max(f32x8::default()).min(f32x8::splat(w));
    y = y.max(f32x8::default()).min(f32x8::splat(h));

    (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast()
}

#[inline(always)]
fn ulp_sub(v: f32) -> f32 {
    // Somewhat similar to v - f32::EPSILON
    bytemuck::cast::<u32, f32>(bytemuck::cast::<f32, u32>(v) - 1)
}
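
// E.g. for a 100px-wide pixmap, `ulp_sub(100.0)` is the largest f32 below 100.0,
// so after clamping, `trunc_int` always produces x indices in 0..=99.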

fn load_mask_u8(_: &mut Pipeline) {
    // unreachable
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);
    let mut c = [0.0; 8];
    for i in 0..p.tail {
        c[i] = p.mask_ctx.data[offset + i] as f32;
    }
    let c = f32x8::from(c) / f32x8::splat(255.0);

    // A fully transparent mask means nothing will be written,
    // so we can skip the rest of the pipeline for this batch.
    if c == f32x8::default() {
        return;
    }

    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}

fn scale_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to f32x8.
    // The AA mask is at most two pixels wide, so only the first two lanes are used.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
    let c = c / f32x8::splat(255.0);

    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to f32x8.
    // The AA mask is at most two pixels wide, so only the first two lanes are used.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
    let c = c / f32x8::splat(255.0);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

fn scale_1_float(p: &mut Pipeline) {
    let c = f32x8::splat(p.ctx.current_coverage);
    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = f32x8::splat(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

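// Porter-Duff blend modes over premultiplied color: `s`/`d` are the source and
// destination channels, `sa`/`da` the corresponding alphas, and `inv(x)` is `1 - x`.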
blend_fn!(clear,            |_, _,  _,  _| f32x8::default());
blend_fn!(source_atop,      |s, d, sa, da| s * da + d * inv(sa));
blend_fn!(destination_atop, |s, d, sa, da| d * sa + s * inv(da));
blend_fn!(source_in,        |s, _,  _, da| s * da);
blend_fn!(destination_in,   |_, d, sa,  _| d * sa);
blend_fn!(source_out,       |s, _,  _, da| s * inv(da));
blend_fn!(destination_out,  |_, d, sa,  _| d * inv(sa));
blend_fn!(source_over,      |s, d, sa,  _| mad(d, inv(sa), s));
blend_fn!(destination_over, |s, d,  _, da| mad(s, inv(da), d));
blend_fn!(modulate,         |s, d,  _,  _| s * d);
blend_fn!(multiply,         |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d);
blend_fn!(screen,           |s, d,  _,  _| s + d - s * d);
blend_fn!(xor,              |s, d, sa, da| s * inv(da) + d * inv(sa));

// For some reason, the compiler requires explicit parameter types here.
blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0)));

macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // The same logic is applied to each color channel, while alpha is
            // always blended with source_over.
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = mad(p.da, inv(p.a), p.a);

            p.next_stage();
        }
    };
}

blend_fn2!(darken,      |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa));
blend_fn2!(lighten,     |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa));
blend_fn2!(difference,  |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa)));
blend_fn2!(exclusion,   |s: f32x8, d,  _,  _| s + d - two(s * d));

blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
    d.cmp_eq(da).blend(
        d + s * inv(da),
        s.cmp_eq(f32x8::default()).blend(
            d * inv(sa),
            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
        )
    )
);

blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
    d.cmp_eq(f32x8::default()).blend(
        s * inv(da),
        s.cmp_eq(sa).blend(
            s + d * inv(sa),
            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
        )
    )
);

blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da|
    s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend(
        two(s * d),
        sa * da - two((da - d) * (sa - s))
    )
);

blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da|
    s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend(
        two(s * d),
        sa * da - two((da - d) * (sa - s))
    )
);

blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
    let m  = da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default());
    let s2 = two(s);
    let m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
    let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
    let lite_dst = m.sqrt() - m;
    let lite_src = d * sa + da * (s2 - sa)
        * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?

    s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)?
});

// We're basing our implementation of non-separable blend modes on
//   https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
//   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but the ES math has been simplified further.
//
// Anything extra we add beyond that is to make the math work with premul inputs.

macro_rules! blend_fn3 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let (tr, tg, tb, ta) = $f(p.r, p.g, p.b, p.a, p.dr, p.dg, p.db, p.da);
            p.r = tr;
            p.g = tg;
            p.b = tb;
            p.a = ta;

            p.next_stage();
        }
    };
}

blend_fn3!(hue, hue_k);

#[inline(always)]
fn hue_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (r * a);
    let gg = &mut (g * a);
    let bb = &mut (b * a);

    set_sat(rr, gg, bb, sat(dr, dg, db) * a);
    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(saturation, saturation_k);

#[inline(always)]
fn saturation_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (dr * a);
    let gg = &mut (dg * a);
    let bb = &mut (db * a);

    set_sat(rr, gg, bb, sat(r, g, b) * da);
    set_lum(rr, gg, bb, lum(dr, dg, db) * a); // (This is not redundant.)
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(color, color_k);

#[inline(always)]
fn color_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (r * da);
    let gg = &mut (g * da);
    let bb = &mut (b * da);

    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(luminosity, luminosity_k);

#[inline(always)]
fn luminosity_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (dr * a);
    let gg = &mut (dg * a);
    let bb = &mut (db * a);

    set_lum(rr, gg, bb, lum(r, g, b) * da);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

#[inline(always)]
fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
    r.max(g.max(b)) - r.min(g.min(b))
}

#[inline(always)]
fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
    r * f32x8::splat(0.30) + g * f32x8::splat(0.59) + b * f32x8::splat(0.11)
}

#[inline(always)]
fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) {
    let mn  = r.min(g.min(*b));
    let mx  = r.max(g.max(*b));
    let sat = mx - mn;

    // Map min channel to 0, max channel to s, and scale the middle proportionally.
    let scale = |c| sat.cmp_eq(f32x8::default())
                       .blend(f32x8::default(), (c - mn) * s / sat);

    *r = scale(*r);
    *g = scale(*g);
    *b = scale(*b);
}
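
// E.g. (r, g, b) = (0.2, 0.5, 0.8) with s = 0.3: mn = 0.2, sat = 0.6,
// so the channels become (0.0, 0.15, 0.3), preserving their relative order.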

#[inline(always)]
fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) {
    let diff = l - lum(*r, *g, *b);
    *r += diff;
    *g += diff;
    *b += diff;
}

#[inline(always)]
fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) {
    let mn = r.min(g.min(*b));
    let mx = r.max(g.max(*b));
    let l  = lum(*r, *g, *b);

    let clip = |mut c| {
        c = mx.cmp_ge(f32x8::default()).blend(c, l + (c - l) * l / (l - mn));
        c = mx.cmp_gt(a).blend(l + (c - l) * (a - l) / (mx - l), c);
        c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative.
        c
    };

    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}

pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap_dst.slice4_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = mad(p.dr, inv(p.a), p.r);
    p.g = mad(p.dg, inv(p.a), p.g);
    p.b = mad(p.db, inv(p.a), p.b);
    p.a = mad(p.da, inv(p.a), p.a);
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap_dst.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = mad(p.dr, inv(p.a), p.r);
    p.g = mad(p.dg, inv(p.a), p.g);
    p.b = mad(p.db, inv(p.a), p.b);
    p.a = mad(p.da, inv(p.a), p.a);
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}

fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let tr = mad(p.r, f32x8::splat(ts.sx), mad(p.g, f32x8::splat(ts.kx), f32x8::splat(ts.tx)));
    let tg = mad(p.r, f32x8::splat(ts.ky), mad(p.g, f32x8::splat(ts.sy), f32x8::splat(ts.ty)));
    p.r = tr;
    p.g = tg;

    p.next_stage();
}

// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
// The gather stages will hard clamp the output of these stages to [0,limit)...
// we just need to do the basic repeat or mirroring.

fn reflect(p: &mut Pipeline) {
    let ctx = &p.ctx.limit_x;
    p.r = exclusive_reflect(p.r, ctx.scale, ctx.inv_scale);

    let ctx = &p.ctx.limit_y;
    p.g = exclusive_reflect(p.g, ctx.scale, ctx.inv_scale);

    p.next_stage();
}

#[inline(always)]
fn exclusive_reflect(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
    let limit = f32x8::splat(limit);
    let inv_limit = f32x8::splat(inv_limit);
    ((v - limit) - (limit + limit)
        * ((v - limit) * (inv_limit * f32x8::splat(0.5))).floor() - limit).abs()
}
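
// E.g. with limit = 4.0: v = 5.0 maps to 3.0 and v = 9.0 maps to 1.0,
// mirroring around every multiple of the limit.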

fn repeat(p: &mut Pipeline) {
    let ctx = &p.ctx.limit_x;
    p.r = exclusive_repeat(p.r, ctx.scale, ctx.inv_scale);

    let ctx = &p.ctx.limit_y;
    p.g = exclusive_repeat(p.g, ctx.scale, ctx.inv_scale);

    p.next_stage();
}

#[inline(always)]
fn exclusive_repeat(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
    v - (v * f32x8::splat(inv_limit)).floor() * f32x8::splat(limit)
}
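
// E.g. with limit = 4.0: v = 5.0 maps to 1.0 and v = -1.0 maps to 3.0,
// i.e. a floating-point modulo into [0, limit).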

fn bilinear(p: &mut Pipeline) {
    let x = p.r;
    let fx = (x + f32x8::splat(0.5)).fract();
    let y = p.g;
    let fy = (y + f32x8::splat(0.5)).fract();
    let one = f32x8::splat(1.0);
    let wx = [one - fx, fx];
    let wy = [one - fy, fy];

    sampler_2x2(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

fn bicubic(p: &mut Pipeline) {
    let x = p.r;
    let fx = (x + f32x8::splat(0.5)).fract();
    let y = p.g;
    let fy = (y + f32x8::splat(0.5)).fract();
    let one = f32x8::splat(1.0);
    let wx = [bicubic_far(one - fx), bicubic_near(one - fx), bicubic_near(fx), bicubic_far(fx)];
    let wy = [bicubic_far(one - fy), bicubic_near(one - fy), bicubic_near(fy), bicubic_far(fy)];

    sampler_4x4(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher weights near the center.
//
// We split the filter into two functions: one for the near (0.5) offsets and one for the
// far (1.5) offsets.

#[inline(always)]
fn bicubic_near(t: f32x8) -> f32x8 {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    mad(
        t,
        mad(t,
            mad(
                f32x8::splat(-21.0/18.0),
                t,
                f32x8::splat(27.0/18.0),
            ),
            f32x8::splat(9.0/18.0),
        ),
        f32x8::splat(1.0/18.0),
    )
}

#[inline(always)]
fn bicubic_far(t: f32x8) -> f32x8 {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    (t * t) * mad(f32x8::splat(7.0/18.0), t, f32x8::splat(-6.0/18.0))
}
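
// For any t, the four weights bicubic_far(1 - t), bicubic_near(1 - t),
// bicubic_near(t) and bicubic_far(t) sum to 1 (e.g. at t = 0: 1/18 + 16/18 + 1/18 + 0).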

#[inline(always)]
fn sampler_2x2(
    pixmap: PixmapRef,
    ctx: &super::SamplerCtx,
    cx: f32x8, cy: f32x8,
    wx: &[f32x8; 2], wy: &[f32x8; 2],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    *r = f32x8::default();
    *g = f32x8::default();
    *b = f32x8::default();
    *a = f32x8::default();

    let one = f32x8::splat(1.0);
    let start = -0.5;
    let mut y = cy + f32x8::splat(start);
    for j in 0..2 {
        let mut x = cx + f32x8::splat(start);
        for i in 0..2 {
            let mut rr = f32x8::default();
            let mut gg = f32x8::default();
            let mut bb = f32x8::default();
            let mut aa = f32x8::default();
            sample(pixmap, ctx, x, y, &mut rr, &mut gg, &mut bb, &mut aa);

            let w = wx[i] * wy[j];
            *r = mad(w, rr, *r);
            *g = mad(w, gg, *g);
            *b = mad(w, bb, *b);
            *a = mad(w, aa, *a);

            x += one;
        }

        y += one;
    }
}

#[inline(always)]
fn sampler_4x4(
    pixmap: PixmapRef,
    ctx: &super::SamplerCtx,
    cx: f32x8, cy: f32x8,
    wx: &[f32x8; 4], wy: &[f32x8; 4],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    *r = f32x8::default();
    *g = f32x8::default();
    *b = f32x8::default();
    *a = f32x8::default();

    let one = f32x8::splat(1.0);
    let start = -1.5;
    let mut y = cy + f32x8::splat(start);
    for j in 0..4 {
        let mut x = cx + f32x8::splat(start);
        for i in 0..4 {
            let mut rr = f32x8::default();
            let mut gg = f32x8::default();
            let mut bb = f32x8::default();
            let mut aa = f32x8::default();
            sample(pixmap, ctx, x, y, &mut rr, &mut gg, &mut bb, &mut aa);

            let w = wx[i] * wy[j];
            *r = mad(w, rr, *r);
            *g = mad(w, gg, *g);
            *b = mad(w, bb, *b);
            *a = mad(w, aa, *a);

            x += one;
        }

        y += one;
    }
}

#[inline(always)]
fn sample(
    pixmap: PixmapRef, ctx: &super::SamplerCtx, mut x: f32x8, mut y: f32x8,
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    x = tile(x, ctx.spread_mode, pixmap.width() as f32, ctx.inv_width);
    y = tile(y, ctx.spread_mode, pixmap.height() as f32, ctx.inv_height);

    let ix = gather_ix(pixmap, x, y);
    load_8888(&pixmap.gather(ix), r, g, b, a);
}

#[inline(always)]
fn tile(v: f32x8, mode: SpreadMode, limit: f32, inv_limit: f32) -> f32x8 {
    match mode {
        SpreadMode::Pad => v,
        SpreadMode::Repeat => exclusive_repeat(v, limit, inv_limit),
        SpreadMode::Reflect => exclusive_reflect(v, limit, inv_limit),
    }
}
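
// `Pad` can be a no-op here because `gather_ix` already hard-clamps
// coordinates to the pixmap bounds.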

fn pad_x1(p: &mut Pipeline) {
    p.r = p.r.normalize();

    p.next_stage();
}

fn reflect_x1(p: &mut Pipeline) {
    p.r = (
        (p.r - f32x8::splat(1.0))
            - two(((p.r - f32x8::splat(1.0)) * f32x8::splat(0.5)).floor())
            - f32x8::splat(1.0)
    ).abs().normalize();

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    p.r = (p.r - p.r.floor()).normalize();

    p.next_stage();
}

fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    let t: [f32; 8] = p.r.into();
    let mut idx = u32x8::default();
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let n: u32x8 = bytemuck::cast([
            (t[0] >= tt) as u32,
            (t[1] >= tt) as u32,
            (t[2] >= tt) as u32,
            (t[3] >= tt) as u32,
            (t[4] >= tt) as u32,
            (t[5] >= tt) as u32,
            (t[6] >= tt) as u32,
            (t[7] >= tt) as u32,
        ]);
        idx = idx + n;
    }
    gradient_lookup(ctx, &idx, p.r, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}
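
// Each stop interval is precomputed as a factor/bias pair, so the color at `t`
// is simply `t * factor + bias` per channel.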

fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u32x8, t: f32x8,
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    let idx: [u32; 8] = bytemuck::cast(*idx);

    macro_rules! gather {
        ($d:expr, $c:ident) => {
            // Surprisingly, bounds checking doesn't affect performance here.
            // And since `idx` can contain any value, we have to keep it.
            f32x8::from([
                $d[idx[0] as usize].$c,
                $d[idx[1] as usize].$c,
                $d[idx[2] as usize].$c,
                $d[idx[3] as usize].$c,
                $d[idx[4] as usize].$c,
                $d[idx[5] as usize].$c,
                $d[idx[6] as usize].$c,
                $d[idx[7] as usize].$c,
            ])
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    *r = mad(t, fr, br);
    *g = mad(t, fg, bg);
    *b = mad(t, fb, bb);
    *a = mad(t, fa, ba);
}

fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = p.r;
    p.r = mad(t, f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r));
    p.g = mad(t, f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g));
    p.b = mad(t, f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b));
    p.a = mad(t, f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a));

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x2 = p.r * p.r;
    let y2 = p.g * p.g;
    p.r = (x2 + y2).sqrt();

    p.next_stage();
}

fn xy_to_2pt_conical_focal_on_circle(p: &mut Pipeline) {
    let x = p.r;
    let y = p.g;
    p.r = x + y * y / x;

    p.next_stage();
}

fn xy_to_2pt_conical_well_behaved(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = (x * x + y * y).sqrt() - x * f32x8::splat(ctx.p0);

    p.next_stage();
}

fn xy_to_2pt_conical_greater(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = (x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);

    p.next_stage();
}

fn mask_2pt_conical_degenerates(p: &mut Pipeline) {
    let ctx = &mut p.ctx.two_point_conical_gradient;

    let t = p.r;
    // A lane is degenerate when t <= 0 or t is NaN (NaN != NaN).
    let is_degenerate = t.cmp_le(f32x8::default()) | t.cmp_ne(t);
    p.r = is_degenerate.blend(f32x8::default(), t);

    let is_not_degenerate = !is_degenerate.to_u32x8_bitcast();
    let is_not_degenerate: [u32; 8] = bytemuck::cast(is_not_degenerate);
    ctx.mask = bytemuck::cast([
        if is_not_degenerate[0] != 0 { !0 } else { 0 },
        if is_not_degenerate[1] != 0 { !0 } else { 0 },
        if is_not_degenerate[2] != 0 { !0 } else { 0 },
        if is_not_degenerate[3] != 0 { !0 } else { 0 },
        if is_not_degenerate[4] != 0 { !0 } else { 0 },
        if is_not_degenerate[5] != 0 { !0 } else { 0 },
        if is_not_degenerate[6] != 0 { !0 } else { 0 },
        if is_not_degenerate[7] != 0 { !0 } else { 0 },
    ]);

    p.next_stage();
}
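
// After the stage above, `ctx.mask` has all bits set in the non-degenerate
// lanes and zeros elsewhere, so the bitwise AND below zeroes out degenerate lanes.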

fn apply_vector_mask(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();

    p.next_stage();
}

pub fn just_return(_: &mut Pipeline) {
    // Ends the loop.
}

#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    // Surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`.

    const FACTOR: f32 = 1.0 / 255.0;

    *r = f32x8::from([
        data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR,
        data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR,
        data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR,
        data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR,
    ]);

    *g = f32x8::from([
        data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR,
        data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR,
        data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR,
        data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR,
    ]);

    *b = f32x8::from([
        data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR,
        data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR,
        data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR,
        data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR,
    ]);

    *a = f32x8::from([
        data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR,
        data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR,
        data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR,
        data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    // Fill a dummy array with `tail` values. `tail` is always in the 1..=STAGE_WIDTH-1 range.
    // This way we can reuse `load_8888` and avoid any branches.
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}

#[inline(always)]
fn store_8888(
    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r: [i32; 8] = unnorm(r).into();
    let g: [i32; 8] = unnorm(g).into();
    let b: [i32; 8] = unnorm(b).into();
    let a: [i32; 8] = unnorm(a).into();

    let conv = |rr, gg, bb, aa|
        PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8);

    data[0] = conv(r[0], g[0], b[0], a[0]);
    data[1] = conv(r[1], g[1], b[1], a[1]);
    data[2] = conv(r[2], g[2], b[2], a[2]);
    data[3] = conv(r[3], g[3], b[3], a[3]);
    data[4] = conv(r[4], g[4], b[4], a[4]);
    data[5] = conv(r[5], g[5], b[5], a[5]);
    data[6] = conv(r[6], g[6], b[6], a[6]);
    data[7] = conv(r[7], g[7], b[7], a[7]);
}

#[inline(always)]
fn store_8888_tail(
    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r: [i32; 8] = unnorm(r).into();
    let g: [i32; 8] = unnorm(g).into();
    let b: [i32; 8] = unnorm(b).into();
    let a: [i32; 8] = unnorm(a).into();

    // This is better than `for i in 0..tail`, because this way the compiler knows
    // that the loop runs at most STAGE_WIDTH steps, so indexing into the 8-element
    // channel arrays is guaranteed to be valid. This removes bounds checking and
    // a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}

#[inline(always)]
fn unnorm(v: &f32x8) -> i32x8 {
    (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int()
}

#[inline(always)]
fn inv(v: f32x8) -> f32x8 {
    f32x8::splat(1.0) - v
}

#[inline(always)]
fn two(v: f32x8) -> f32x8 {
    v + v
}

#[inline(always)]
fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 {
    f * m + a
}

#[inline(always)]
fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 {
    mad(to - from, t, from)
}