use crate::PremultipliedColorU8;

use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, u16x16, f32x16};
use crate::geom::ScreenIntRect;

pub const STAGE_WIDTH: usize = 16;

pub type StageFn = fn(p: &mut Pipeline);

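// Per-batch pipeline state. `r`, `g`, `b`, `a` hold the source color and `dr`, `dg`,
// `db`, `da` the destination color, one u16 lane per pixel and up to STAGE_WIDTH
// pixels per batch. `tail` is the number of lanes that are actually valid for the
// current batch, and (`dx`, `dy`) is the batch's position inside the pixmap.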
pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap: &'a mut SubPixmapMut<'b>,
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    ctx: &'a mut super::Context,
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
    tail: usize,
    dx: usize,
    dy: usize,
}

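// Stages form a chain: each stage does its work and then calls `next_stage` to invoke
// the following function from `functions`. A stage that does not call `next_stage`
// (e.g. `just_return`, or `mask_u8` on an all-zero mask) ends the chain for this batch.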
impl Pipeline<'_, '_> {
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}

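// One function per pipeline stage, in the order defined by the parent module
// (`super::STAGES_COUNT` entries). `null_fn` marks stages this u16-based pipeline does
// not implement; callers can presumably detect such entries via `fn_ptr_eq` and fall
// back to a different pipeline.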
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn,
    null_fn,
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn,
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn,
    null_fn,
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    source_over_rgba,
    transform,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_radius,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
];

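// Stage functions are compared by address, e.g. to check whether a stage slot holds
// `null_fn` or a particular stage such as `source_over_rgba`.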
pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}

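// Runs the pipeline over `rect`, one scanline at a time: full STAGE_WIDTH-pixel
// batches go through `functions`, while the remaining pixels of each row go through
// `functions_tail`, which is expected to use the `*_tail` variants of the load/store
// stages so that only `tail` pixels are touched.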
#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}

fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

fn premultiply(p: &mut Pipeline) {
    p.r = div255(p.r * p.a);
    p.g = div255(p.g * p.a);
    p.b = div255(p.b * p.a);

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = p.ctx.uniform_color;
    p.r = u16x16::splat(ctx.rgba[0]);
    p.g = u16x16::splat(ctx.rgba[1]);
    p.b = u16x16::splat(ctx.rgba[2]);
    p.a = u16x16::splat(ctx.rgba[3]);

    p.next_stage();
}

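// Seeds the shader with pixel-center coordinates: x = dx + 0.5, 1.5, ..., 15.5 and
// y = dy + 0.5. The two f32x16 values are stored bitwise across (r, g) and (b, a) via
// `split`; later shader stages read them back with `join`.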
fn seed_shader(p: &mut Pipeline) {
    let iota = f32x16(
        f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]),
        f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
    );

    let x = f32x16::splat(p.dx as f32) + iota;
    let y = f32x16::splat(p.dy as f32 + 0.5);
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}

pub fn load_dst_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let mut tmp = [0u8; STAGE_WIDTH];
    tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
    load_8(&tmp, &mut p.da);

    p.next_stage();
}

pub fn store_u8(p: &mut Pipeline) {
    let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    data[ 0] = a[ 0] as u8;
    data[ 1] = a[ 1] as u8;
    data[ 2] = a[ 2] as u8;
    data[ 3] = a[ 3] as u8;
    data[ 4] = a[ 4] as u8;
    data[ 5] = a[ 5] as u8;
    data[ 6] = a[ 6] as u8;
    data[ 7] = a[ 7] as u8;
    data[ 8] = a[ 8] as u8;
    data[ 9] = a[ 9] as u8;
    data[10] = a[10] as u8;
    data[11] = a[11] as u8;
    data[12] = a[12] as u8;
    data[13] = a[13] as u8;
    data[14] = a[14] as u8;
    data[15] = a[15] as u8;

    p.next_stage();
}

pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}

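// Clip-mask stages. `load_mask_u8` replaces the source color with the mask coverage
// (alpha only, RGB zeroed), while `mask_u8` multiplies the source color by the
// coverage and returns early, skipping all remaining stages, when the coverage is
// zero for every lane.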
fn load_mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    p.r = u16x16::splat(0);
    p.g = u16x16::splat(0);
    p.b = u16x16::splat(0);
    p.a = c;

    p.next_stage();
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

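// Anti-aliasing mask stages. Only the first two bytes returned by
// `AAMaskCtx::copy_at_xy` are used, presumably because the anti-aliased mask covers
// at most two pixels per run; the remaining lanes simply get zero coverage.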
fn scale_u8(p: &mut Pipeline) {
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

fn scale_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

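// Porter-Duff-style blend modes in 8-bit fixed point. The closure receives
// (source, destination, source alpha, destination alpha) for a single channel and is
// applied to all four channels, including alpha.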
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

blend_fn!(clear, |_, _, _, _| u16x16::splat(0));
blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in, |s, _, _, da| div255(s * da));
blend_fn!(destination_in, |_, d, sa, _| div255(d * sa));
blend_fn!(source_out, |s, _, _, da| div255(s * inv(da)));
blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa)));
blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da)));
blend_fn!(modulate, |s, d, _, _| div255(s * d));
blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen, |s, d, _, _| s + d - div255(s * d));
blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));

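// Blend modes where only the color channels use the custom formula; alpha is always
// combined with plain source-over: a + da * (255 - a) / 255.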
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}

blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference, |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s + s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa - s) * (da - d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d + d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa - s) * (da - d)
        )
    )
});

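// Fused load + source-over + store for RGBA pixmaps, so the most common blend path
// can skip separate `load_dst`/`store` stages.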
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}

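// Applies the affine transform from the context to the x/y coordinates currently
// stored across (r, g) and (b, a): nx = sx*x + kx*y + tx, ny = ky*x + sy*y + ty.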
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}

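// Tiling stages for the 1D gradient parameter held in (r, g): `pad_x1` clamps it,
// `reflect_x1` mirrors it back and forth, and `repeat_x1` keeps only the fractional
// part. All three rely on `normalize`, which is assumed here to clamp the value to
// the valid [0, 1] parameter range.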
fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

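// Multi-stop gradient: for every lane, count how many stop positions `t_values[i]`
// lie at or below t to get that lane's stop index, then let `gradient_lookup` gather
// the stop's factor/bias pair and evaluate color = factor * t + bias per channel.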
fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    let t = join(&p.r, &p.g);
    let mut idx = u16x16::splat(0);
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let t0: [f32; 8] = t.0.into();
        let t1: [f32; 8] = t.1.into();
        idx.0[ 0] += (t0[0] >= tt) as u16;
        idx.0[ 1] += (t0[1] >= tt) as u16;
        idx.0[ 2] += (t0[2] >= tt) as u16;
        idx.0[ 3] += (t0[3] >= tt) as u16;
        idx.0[ 4] += (t0[4] >= tt) as u16;
        idx.0[ 5] += (t0[5] >= tt) as u16;
        idx.0[ 6] += (t0[6] >= tt) as u16;
        idx.0[ 7] += (t0[7] >= tt) as u16;
        idx.0[ 8] += (t1[0] >= tt) as u16;
        idx.0[ 9] += (t1[1] >= tt) as u16;
        idx.0[10] += (t1[2] >= tt) as u16;
        idx.0[11] += (t1[3] >= tt) as u16;
        idx.0[12] += (t1[4] >= tt) as u16;
        idx.0[13] += (t1[5] >= tt) as u16;
        idx.0[14] += (t1[6] >= tt) as u16;
        idx.0[15] += (t1[7] >= tt) as u16;
    }
    gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

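// Two-stop gradient specialization: no stop search is needed, the color is simply
// factor * t + bias per channel, rounded back into the u16 lanes.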
fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x * x + y * y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}

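// Converts the f32 colors produced by the gradient stages back into 8-bit lanes:
// clamp via `normalize` (assumed to map into [0, 1]), scale by 255 and round by
// adding 0.5. Alpha skips the clamp, presumably because it is already in range.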
#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}

pub fn just_return(_: &mut Pipeline) {}

pub fn null_fn(_: &mut Pipeline) {}

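// Packing helpers: `load_8888`/`store_8888` convert between 16 premultiplied RGBA8
// pixels and the four u16x16 channel registers. The `_tail` variants go through a
// STAGE_WIDTH-sized temporary or an early-exit loop so that partial batches never
// read or write more than `tail` pixels.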
#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    *r = u16x16([
        data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
        data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
        data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
        data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
    ]);

    *g = u16x16([
        data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
        data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
        data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
        data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
    ]);

    *b = u16x16([
        data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
        data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
        data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
        data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
    ]);

    *a = u16x16([
        data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
        data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
        data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
        data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}

#[inline(always)]
fn store_8888(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
}

#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}

#[inline(always)]
fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
    *a = u16x16([
        data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
        data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
        data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
        data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
    ]);
}

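// Approximate division by 255 for products of two 8-bit values:
// (v + 255) >> 8 == (v + 255) / 256, which is exact whenever v is a multiple of 255
// and otherwise off by at most one, e.g. v = 255 * 255 = 65025 -> 65280 / 256 = 255.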
#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    (v + u16x16::splat(255)) >> u16x16::splat(8)
}

#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}

#[inline(always)]
fn from_float(f: f32) -> u16x16 {
    u16x16::splat((f * 255.0 + 0.5) as u16)
}

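// Fixed-point linear interpolation with t in 0..=255:
// lerp(from, to, t) = (from * (255 - t) + to * t) / 255, so t == 0 yields `from` and
// t == 255 yields `to` exactly, since both products are multiples of 255.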
#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
    div255(from * inv(t) + to * t)
}

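// `split` and `join` reinterpret the 64 bytes of an f32x16 as two u16x16 registers
// and back. This is a bitwise copy, not a numeric conversion: it lets the shader
// stages keep f32 coordinates and gradient parameters in the pipeline's integer
// registers between stages.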
#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    let data: [u8; 64] = bytemuck::cast(*v);
    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}

#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}

#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    f * m + a
}