zune_jpeg/color_convert/
avx.rs

1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! AVX color conversion routines
10//!
11//! Okay these codes are cool
12//!
13//! Herein lies super optimized codes to do color conversions.
14//!
15//!
//! 1. The YCbCr to RGB conversion uses integer approximations and not the floating point equivalent.
//! That means we may be within +-2 of the pixel values generated by libjpeg-turbo jpeg decoding
18//! (also libjpeg uses routines like `Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
19//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points.., fun fact: the first fun fact wasn't even fun.)
22//!
//! Secondly, we have cool clamping code, especially for rgba, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions should work like two 128-bit vectors (the compiler optimizes
//! it out to something cool).
26//!
27//! There isn't a lot here (not as fun as bitstream ) but I hope you find what you're looking for.
28//!
29//! O and ~~subscribe to my youtube channel~~
30
31#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
32#![cfg(feature = "x86")]
33#![allow(
34    clippy::wildcard_imports,
35    clippy::cast_possible_truncation,
36    clippy::too_many_arguments,
37    clippy::inline_always,
38    clippy::doc_markdown,
39    dead_code
40)]
41
42#[cfg(target_arch = "x86")]
43use core::arch::x86::*;
44#[cfg(target_arch = "x86_64")]
45use core::arch::x86_64::*;
46
/// A 256-bit vector that can be viewed either as a raw `__m256i` or as
/// sixteen `i16` lanes.
///
/// The clamped conversion routine returns its channels in this form so the
/// individual lanes can be read back one by one.
pub union YmmRegister {
    // Raw vector form, as produced by the AVX2 intrinsics.
    // Both fields are 32 bytes when measured with std::mem::size_of.
    mm256: __m256i,
    // Lane-wise view, read by the AVX color conversion when storing pixels.
    array: [i16; 16]
}
53
54//--------------------------------------------------------------------------------------------------
55// AVX conversion routines
56//--------------------------------------------------------------------------------------------------
57
///
/// Convert YCbCr to RGB using AVX2 instructions
///
/// Sixteen pixels are converted per call and stored as interleaved
/// `R, G, B` bytes (48 bytes in total) starting at `out[*offset]`.
///
///  # Note
///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
/// AVX2 OTHERWISE THIS IS UB**
///
/// *Peace*
///
/// This library itself will ensure that it's never called in CPU's not
/// supporting AVX2
///
/// # Arguments
/// - `y`,`cb`,`cr`: References to arrays of 16 `i16` samples, one per pixel
/// - `out`: The output slice where the converted pixels are stored
/// - `offset`: The position in `out` where writing starts; it is advanced
///   by 48 once the pixels have been written
///
/// # Panics
/// Panics if `out` does not hold 48 bytes starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // The real work lives in a separate function that carries
    // `#[target_feature(enable = "avx2")]`, so that it is compiled with
    // AVX2 enabled.
    // SAFETY: relies on the caller having verified AVX2 support, as stated
    // in the note above.
    unsafe {
        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
    }
}
84
85#[inline]
86#[target_feature(enable = "avx2")]
87#[target_feature(enable = "avx")]
88unsafe fn ycbcr_to_rgb_avx2_1(
89    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
90) {
91    // Load output buffer
92    let tmp: &mut [u8; 48] = out
93        .get_mut(*offset..*offset + 48)
94        .expect("Slice to small cannot write")
95        .try_into()
96        .unwrap();
97
98    let (r, g, b) = ycbcr_to_rgb_baseline(y, cb, cr);
99
100    let mut j = 0;
101    let mut i = 0;
102    while i < 48 {
103        tmp[i] = r.array[j] as u8;
104
105        tmp[i + 1] = g.array[j] as u8;
106        tmp[i + 2] = b.array[j] as u8;
107        i += 3;
108        j += 1;
109    }
110
111    *offset += 48;
112}
113
114/// Baseline implementation of YCBCR to RGB for avx,
115///
116/// It uses integer operations as opposed to floats, the approximation is
117/// difficult for the  eye to see, but this means that it may produce different
118/// values with libjpeg_turbo.  if accuracy is of utmost importance, use that.
119///
120/// this function should be called for most implementations, including
121/// - ycbcr->rgb
122/// - ycbcr->rgba
123/// - ycbcr->brga
124/// - ycbcr->rgbx
125#[inline]
126#[target_feature(enable = "avx2")]
127#[target_feature(enable = "avx")]
128unsafe fn ycbcr_to_rgb_baseline(
129    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
130) -> (YmmRegister, YmmRegister, YmmRegister) {
131    // Load values into a register
132    //
133    // dst[127:0] := MEM[loaddr+127:loaddr]
134    // dst[255:128] := MEM[hiaddr+127:hiaddr]
135    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
136
137    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
138
139    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
140
141    // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
142
143    // Cb = Cb-128;
144    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
145
146    // cr = Cb -128;
147    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));
148
149    // Calculate Y->R
150    // r = Y + 45 * Cr / 32
151    // 45*cr
152    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
153
154    // r1>>5
155    let r2 = _mm256_srai_epi16::<5>(r1);
156
157    //y+r2
158
159    let r = YmmRegister {
160        mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
161    };
162
163    // g = Y - (11 * Cb + 23 * Cr) / 32 ;
164
165    // 11*cb
166    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
167
168    // 23*cr
169    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
170
171    //(11
172    //(11 * Cb + 23 * Cr)
173    let g3 = _mm256_add_epi16(g1, g2);
174
175    // (11 * Cb + 23 * Cr) / 32
176    let g4 = _mm256_srai_epi16::<5>(g3);
177
178    // Y - (11 * Cb + 23 * Cr) / 32 ;
179    let g = YmmRegister {
180        mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
181    };
182
183    // b = Y + 113 * Cb / 64
184    // 113 * cb
185    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
186
187    //113 * Cb / 64
188    let b2 = _mm256_srai_epi16::<6>(b1);
189
190    // b = Y + 113 * Cb / 64 ;
191    let b = YmmRegister {
192        mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
193    };
194
195    return (r, g, b);
196}
197
/// YCbCr to RGB conversion for AVX2 which does not carry out clamping.
///
/// Returns the `(r, g, b)` channels as sixteen `i16` lanes each. Lanes may
/// fall outside `0..=255`; callers are expected to saturate them afterwards.
///
/// This is used by the `ycbcr_to_rgba_avx` and `ycbcr_to_rgbx` conversion
/// routines
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    // AVX version of the integer formulas in
    // https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb

    // Unaligned loads: the input arrays carry no 32-byte alignment guarantee.
    let luma = _mm256_loadu_si256(y.as_ptr().cast());

    // Centre both chroma channels around zero: Cb - 128 and Cr - 128.
    let chroma_bias = _mm256_set1_epi16(128);
    let cb_centered = _mm256_sub_epi16(_mm256_loadu_si256(cb.as_ptr().cast()), chroma_bias);
    let cr_centered = _mm256_sub_epi16(_mm256_loadu_si256(cr.as_ptr().cast()), chroma_bias);

    // r = Y + ((45 * Cr) >> 5), using an arithmetic shift.
    let red_term =
        _mm256_srai_epi16::<5>(_mm256_mullo_epi16(_mm256_set1_epi16(45), cr_centered));
    let red = _mm256_add_epi16(luma, red_term);

    // g = Y - ((11 * Cb + 23 * Cr) >> 5)
    let green_term = _mm256_srai_epi16::<5>(_mm256_add_epi16(
        _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_centered),
        _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_centered)
    ));
    let green = _mm256_sub_epi16(luma, green_term);

    // b = Y + ((113 * Cb) >> 6)
    let blue_term =
        _mm256_srai_epi16::<6>(_mm256_mullo_epi16(_mm256_set1_epi16(113), cb_centered));
    let blue = _mm256_add_epi16(luma, blue_term);

    (red, green, blue)
}
266
/// Convert YCbCr to RGBA using AVX2 instructions
///
/// Sixteen pixels are converted per call and stored as interleaved
/// `R, G, B, A` bytes (64 bytes in total) starting at `out[*offset]`. The
/// alpha byte is always 255.
///
/// # Note
/// The caller must make sure the CPU supports AVX2; the function this
/// forwards to is compiled with `#[target_feature(enable = "avx2")]`.
///
/// # Arguments
/// - `y`,`cb`,`cr`: References to arrays of 16 `i16` samples, one per pixel
/// - `out`: The output slice where the converted pixels are stored
/// - `offset`: The position in `out` where writing starts; it is advanced
///   by 64 once the pixels have been written
///
/// # Panics
/// Panics if `out` does not hold 64 bytes starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // SAFETY: relies on the caller having verified AVX2 support, as stated
    // in the note above.
    unsafe {
        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
    }
}
275
276#[inline]
277#[target_feature(enable = "avx2")]
278#[rustfmt::skip]
279unsafe fn ycbcr_to_rgba_unsafe(
280    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
281    out: &mut [u8],
282    offset: &mut usize,
283)
284{
285    // check if we have enough space to write.
286    let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
287
288    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
289
290    // set alpha channel to 255 for opaque
291
292    // And no these comments were not from me pressing the keyboard
293
294    // Pack the integers into u8's using signed saturation.
295    let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
296    let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd
297    // transpose_u16 and interleave channels
298    let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
299    let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
300    // final transpose_u16
301    let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
302    let h = _mm256_unpackhi_epi8(e, f);
303
304
305    // undo packus shuffling...
306    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
307
308    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
309
310    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
311
312    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
313
314    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
315
316    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
317
318
319    // Store
320    // Use streaming instructions to prevent polluting the cache?
321    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
322
323    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
324
325    *offset += 64;
326}
327
/// Clamp every 16-bit lane of `reg` to the range 0..=255
///
/// Lanes below 0 become 0 and lanes above 255 become 255 (the accepted
/// values for RGB); lanes already in range are returned untouched.
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let floor = _mm256_setzero_si256();
    let ceiling = _mm256_set1_epi16(255);

    // min(max(reg, 0), 255) using signed 16-bit comparisons
    _mm256_min_epi16(_mm256_max_epi16(reg, floor), ceiling)
}
346
/// Packs four selector values into a single immediate.
///
/// `w` lands in the lowest bits and each following argument is placed two
/// bits higher, i.e. the result is `(z << 6) | (y << 4) | (x << 2) | w`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    // Horner-style accumulation: shift what we have by two bits, then OR in
    // the next selector.
    let upper = (z << 2) | y;
    let middle = (upper << 2) | x;
    (middle << 2) | w
}