#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
    clippy::wildcard_imports,
    clippy::cast_possible_truncation,
    clippy::too_many_arguments,
    clippy::inline_always,
    clippy::doc_markdown,
    dead_code
)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// A 256-bit value that can be viewed either as a raw AVX2 register or
/// as sixteen `i16` lanes.
pub union YmmRegister {
    mm256: __m256i,
    array: [i16; 16],
}

/// Convert one block of 16 Y, 16 Cb and 16 Cr values to interleaved
/// RGB, writing 48 bytes into `out` starting at `*offset` and advancing
/// `*offset` by 48.
///
/// The caller must verify that the CPU supports AVX2 before calling
/// this function.
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    unsafe {
        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
    }
}

/// Perform the actual conversion for one block.
///
/// Panics if `out` does not have 48 bytes of room at `*offset`.
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_avx2_1(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    let tmp: &mut [u8; 48] = out
        .get_mut(*offset..*offset + 48)
        .expect("Slice too small, cannot write")
        .try_into()
        .unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline(y, cb, cr);

    // Interleave the planar R, G and B lanes into RGB triples.
    let mut j = 0;
    let mut i = 0;
    while i < 48 {
        tmp[i] = r.array[j] as u8;
        tmp[i + 1] = g.array[j] as u8;
        tmp[i + 2] = b.array[j] as u8;
        i += 3;
        j += 1;
    }

    *offset += 48;
}
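
// The constants used below appear to be fixed-point approximations of
// the standard JFIF conversion coefficients, scaled by 32 or 64 so the
// multiply stays within `i16` and the scale is undone by an arithmetic
// right shift:
//
//   R = Y + 1.402 * (Cr - 128) ~= Y + (45 * (Cr - 128)) >> 5,
//       since 45 / 32 = 1.40625
//   G = Y - 0.344 * (Cb - 128) - 0.714 * (Cr - 128)
//     ~= Y - (11 * (Cb - 128) + 23 * (Cr - 128)) >> 5,
//       since 11 / 32 = 0.34375 and 23 / 32 = 0.71875
//   B = Y + 1.772 * (Cb - 128) ~= Y + (113 * (Cb - 128)) >> 6,
//       since 113 / 64 = 1.765625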

/// Convert 16 YCbCr values to planar R, G and B, clamping each lane to
/// the `0..=255` range.
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_baseline(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (YmmRegister, YmmRegister, YmmRegister) {
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());

    // Center the chroma channels around zero.
    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));

    // R = Y + (45 * Cr) >> 5
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
    let r2 = _mm256_srai_epi16::<5>(r1);
    let r = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
    };

    // G = Y - (11 * Cb + 23 * Cr) >> 5
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
    let g3 = _mm256_add_epi16(g1, g2);
    let g4 = _mm256_srai_epi16::<5>(g3);
    let g = YmmRegister {
        mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
    };

    // B = Y + (113 * Cb) >> 6
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
    let b2 = _mm256_srai_epi16::<6>(b1);
    let b = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
    };

    (r, g, b)
}
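
// A note on the unclamped variant below: the RGBA path packs the
// 16-bit lanes with `_mm256_packus_epi16`, which already saturates to
// `0..=255`, so an explicit clamp would be redundant there.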

/// Convert 16 YCbCr values to planar R, G and B without clamping,
/// leaving saturation to the byte-packing step.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());

    // Center the chroma channels around zero.
    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));

    // R = Y + (45 * Cr) >> 5
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
    let r2 = _mm256_srai_epi16::<5>(r1);
    let r = _mm256_add_epi16(y_c, r2);

    // G = Y - (11 * Cb + 23 * Cr) >> 5
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
    let g3 = _mm256_add_epi16(g1, g2);
    let g4 = _mm256_srai_epi16::<5>(g3);
    let g = _mm256_sub_epi16(y_c, g4);

    // B = Y + (113 * Cb) >> 6
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
    let b2 = _mm256_srai_epi16::<6>(b1);
    let b = _mm256_add_epi16(b2, y_c);

    (r, g, b)
}

/// Convert one block of 16 Y, 16 Cb and 16 Cr values to interleaved
/// RGBA (alpha fixed at 255), writing 64 bytes into `out` starting at
/// `*offset` and advancing `*offset` by 64.
///
/// The caller must verify that the CPU supports AVX2 before calling
/// this function.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    unsafe {
        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
    }
}
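
// The interleave in `ycbcr_to_rgba_unsafe` works in three stages:
// `_mm256_packus_epi16` packs the four 16-bit planes down to saturated
// bytes, two rounds of byte unpacking interleave them into RGBA order
// within each 128-bit lane, and the final permutes and blends put the
// two 128-bit lanes back into memory order.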

#[inline]
#[target_feature(enable = "avx2")]
#[rustfmt::skip]
unsafe fn ycbcr_to_rgba_unsafe(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    let tmp: &mut [u8; 64] = out
        .get_mut(*offset..*offset + 64)
        .expect("Slice too small, cannot write")
        .try_into()
        .unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);

    // Pack the 16-bit planes to saturated bytes, then interleave them
    // into RGBA order within each 128-bit lane. Note that `g` below
    // shadows the green plane, which is no longer needed by that point.
    let c = _mm256_packus_epi16(r, g);
    let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255));
    let e = _mm256_unpacklo_epi8(c, d);
    let f = _mm256_unpackhi_epi8(c, d);
    let g = _mm256_unpacklo_epi8(e, f);
    let h = _mm256_unpackhi_epi8(e, f);

    // Permute and blend the 128-bit lanes back into memory order.
    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);

    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);

    *offset += 64;
}

/// Clamp every 16-bit lane to the `0..=255` range.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let min_s = _mm256_set1_epi16(0);
    let max_s = _mm256_set1_epi16(255);

    let max_v = _mm256_max_epi16(reg, min_s);
    let min_v = _mm256_min_epi16(max_v, max_s);

    min_v
}
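
// `shuffle` builds the same immediate encoding as Intel's
// `_MM_SHUFFLE(z, y, x, w)` macro: two bits per selector, `z` in the
// highest position. For example, `shuffle(3, 2, 1, 0)` is
// `0b11_10_01_00`, i.e. 228.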

/// Build a shuffle immediate from four 2-bit selectors, `z` in the
/// highest bits through `w` in the lowest.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}
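
// A minimal sanity check, assuming `std` is available in test builds so
// that `is_x86_feature_detected!` can be used: a uniform gray block
// (Y = 128, Cb = Cr = 128) has zero chroma after centering, so every
// output byte should come back as exactly 128.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn gray_block_converts_to_gray_rgb() {
        if !is_x86_feature_detected!("avx2") {
            // Nothing to test on machines without AVX2 support.
            return;
        }
        let y = [128_i16; 16];
        let cb = [128_i16; 16];
        let cr = [128_i16; 16];
        let mut out = [0_u8; 48];
        let mut offset = 0;

        ycbcr_to_rgb_avx2(&y, &cb, &cr, &mut out, &mut offset);

        assert_eq!(offset, 48);
        assert!(out.iter().all(|&px| px == 128));
    }
}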