tiny_skia/wide/u16x16_t.rs

// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// No need to use explicit 256-bit AVX2 SIMD.
// `-C target-cpu=native` will autovectorize it better than we can.
// Not even sure why explicit instructions are so slow...
//
// On ARM AArch64 we can actually get up to a 2x performance boost by using SIMD.
//
// We also have to inline all the methods. They are pretty large,
// but without inlining, performance plummets.

#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use bytemuck::cast;
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use core::arch::aarch64::uint16x8_t;

/// A 16-lane vector of `u16` values.
///
/// On AArch64 with the `simd` feature the lanes are processed as two NEON
/// `uint16x8_t` halves; everywhere else they are processed one by one.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, PartialEq, Default, Debug)]
pub struct u16x16(pub [u16; 16]);

// Applies a scalar `u16` method to every lane (portable fallback path).
macro_rules! impl_u16x16_op {
    ($a:expr, $op:ident, $b:expr) => {
        u16x16([
            $a.0[0].$op($b.0[0]),
            $a.0[1].$op($b.0[1]),
            $a.0[2].$op($b.0[2]),
            $a.0[3].$op($b.0[3]),
            $a.0[4].$op($b.0[4]),
            $a.0[5].$op($b.0[5]),
            $a.0[6].$op($b.0[6]),
            $a.0[7].$op($b.0[7]),
            $a.0[8].$op($b.0[8]),
            $a.0[9].$op($b.0[9]),
            $a.0[10].$op($b.0[10]),
            $a.0[11].$op($b.0[11]),
            $a.0[12].$op($b.0[12]),
            $a.0[13].$op($b.0[13]),
            $a.0[14].$op($b.0[14]),
            $a.0[15].$op($b.0[15]),
        ])
    };
}

// Splits both operands into two NEON `uint16x8_t` halves and applies the given
// AArch64 intrinsic to each half. The double braces make the expansion a
// single block expression.
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
macro_rules! impl_aarch64_call {
    ($f:ident, $a:expr, $b:expr) => {{
        let a = $a.split();
        let b = $b.split();
        Self(bytemuck::cast([
            unsafe { core::arch::aarch64::$f(a.0, b.0) },
            unsafe { core::arch::aarch64::$f(a.1, b.1) },
        ]))
    }};
}

impl u16x16 {
    #[inline]
    pub fn splat(n: u16) -> Self {
        Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n])
    }

    #[inline]
    pub fn as_slice(&self) -> &[u16; 16] {
        &self.0
    }

    #[inline]
    pub fn min(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vminq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, min, rhs)
            }
        }
    }

    #[inline]
    pub fn max(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmaxq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, max, rhs)
            }
        }
    }

    /// Lane-wise `self <= rhs`; a lane is set to `!0` when true and `0` otherwise.
    #[inline]
    pub fn cmp_le(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vcleq_u16, self, rhs)
            } else {
                Self([
                    if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 },
                    if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 },
                    if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 },
                    if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 },
                    if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 },
                    if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 },
                    if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 },
                    if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 },
                    if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 },
                    if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 },
                    if self.0[10] <= rhs.0[10] { !0 } else { 0 },
                    if self.0[11] <= rhs.0[11] { !0 } else { 0 },
                    if self.0[12] <= rhs.0[12] { !0 } else { 0 },
                    if self.0[13] <= rhs.0[13] { !0 } else { 0 },
                    if self.0[14] <= rhs.0[14] { !0 } else { 0 },
                    if self.0[15] <= rhs.0[15] { !0 } else { 0 },
                ])
            }
        }
    }

    /// Bitwise select: takes bits from `t` where `self` has ones and bits from
    /// `e` where it has zeros. Meant for all-ones/all-zeros masks like the ones
    /// produced by `cmp_le`.
    #[inline]
    pub fn blend(self, t: Self, e: Self) -> Self {
        (t & self) | (e & !self)
    }

    /// Reinterprets the 16 lanes as two NEON `uint16x8_t` registers.
    #[inline]
    #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
    pub fn split(self) -> (uint16x8_t, uint16x8_t) {
        let pair: [uint16x8_t; 2] = cast(self.0);
        (pair[0], pair[1])
    }
}

impl core::ops::Add<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vaddq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, add, rhs)
            }
        }
    }
}

impl core::ops::Sub<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vsubq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, sub, rhs)
            }
        }
    }
}

impl core::ops::Mul<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmulq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, mul, rhs)
            }
        }
    }
}

// Scalar-only: NEON has no integer division instruction.
impl core::ops::Div<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn div(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, div, rhs)
    }
}

impl core::ops::BitAnd<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vandq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitand, rhs)
            }
        }
    }
}

impl core::ops::BitOr<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vorrq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitor, rhs)
            }
        }
    }
}

impl core::ops::Not for u16x16 {
    type Output = Self;

    #[inline]
    fn not(self) -> Self::Output {
        u16x16([
            !self.0[0],
            !self.0[1],
            !self.0[2],
            !self.0[3],
            !self.0[4],
            !self.0[5],
            !self.0[6],
            !self.0[7],
            !self.0[8],
            !self.0[9],
            !self.0[10],
            !self.0[11],
            !self.0[12],
            !self.0[13],
            !self.0[14],
            !self.0[15],
        ])
    }
}

impl core::ops::Shr for u16x16 {
    type Output = Self;

    #[inline]
    fn shr(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, shr, rhs)
    }
}
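
// A minimal sanity-check sketch (not exhaustive) of the mask semantics above:
// `cmp_le` yields all-ones/all-zeros lanes and `blend` selects bits through
// that mask. Lane values below are arbitrary, chosen only for illustration.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cmp_le_and_blend_select_lanes() {
        let a = u16x16::splat(2);
        let mut b = u16x16::splat(3);
        b.0[0] = 1; // make lane 0 compare as "greater than rhs"

        let mask = a.cmp_le(&b);
        assert_eq!(mask.0[0], 0); // 2 <= 1 is false
        assert_eq!(mask.0[1], !0u16); // 2 <= 3 is true -> 0xFFFF

        // `blend` picks `t` where the mask is all ones and `e` elsewhere.
        let t = u16x16::splat(10);
        let e = u16x16::splat(20);
        let r = mask.blend(t, e);
        assert_eq!(r.0[0], 20);
        assert_eq!(r.0[1], 10);
    }
}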