zune_jpeg/color_convert/
avx.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

//! AVX color conversion routines
//!
//! Okay these codes are cool
//!
//! Herein lies super optimized codes to do color conversions.
//!
//!
//! 1. The YCbCr to RGB use integer approximations and not the floating point equivalent.
//! That means we may be +- 2 of pixels generated by libjpeg-turbo jpeg decoding
//! (also libjpeg uses routines like `Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points.., fun fact: the first fun fact wasn't even fun.)
//!
//! Secondly, we have cool clamping code, especially for rgba, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions should work like two 128-bit vectors
//! (the compiler optimizes it out to something cool).
//!
//! There isn't a lot here (not as fun as bitstream ) but I hope you find what you're looking for.
//!
//! O and ~~subscribe to my youtube channel~~

#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
    clippy::wildcard_imports,
    clippy::cast_possible_truncation,
    clippy::too_many_arguments,
    clippy::inline_always,
    clippy::doc_markdown,
    dead_code
)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// A 256-bit AVX register that can also be viewed as sixteen `i16` lanes.
///
/// Values are written through `mm256` by the SIMD routines and read back
/// lane by lane through `array` when storing pixels.
///
/// `#[repr(C)]` guarantees that both fields start at offset 0, which the
/// default union representation does not promise; the type punning between
/// the two views relies on that.
#[repr(C)]
pub union YmmRegister {
    // SIMD view; 32 bytes, same size as `array`
    mm256: __m256i,
    // lane view, used by the AVX color conversion when writing pixels
    array: [i16; 16]
}

//--------------------------------------------------------------------------------------------------
// AVX conversion routines
//--------------------------------------------------------------------------------------------------

///
/// Convert 16 YCbCr pixels to interleaved RGB using AVX2 instructions
///
/// Writes 48 bytes (16 pixels x 3 channels) into `out` starting at `*offset`
/// and then advances `*offset` by 48.
///
///  # Note
///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
/// AVX2 OTHERWISE THIS IS UB**
///
/// *Peace*
///
/// This library itself will ensure that it's never called in CPU's not
/// supporting AVX2
///
/// # Arguments
/// - `y`,`cb`,`cr`: References to 16 `i16` samples each, one per pixel
/// - `out`: The output slice where the converted pixels are stored
/// - `offset`: The position in `out` where writing starts; incremented by 48
///
/// # Panics
/// If `out` does not have 48 bytes available starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // The real work lives in a separate `#[target_feature]` function so the
    // compiler may use AVX2 instructions for the conversion and the store.
    //
    // SAFETY (caller contract): the callee is only sound on CPUs with AVX2;
    // this wrapper performs no runtime feature check itself.
    unsafe {
        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
    }
}

/// AVX2 body of [`ycbcr_to_rgb_avx2`].
///
/// Converts 16 pixels with `ycbcr_to_rgb_baseline` and stores them as
/// interleaved `R, G, B` bytes in `out[*offset..*offset + 48]`, then bumps
/// `*offset` by 48.
///
/// # Panics
/// If `out` has fewer than 48 bytes available from `*offset`.
///
/// # Safety
/// Must only be called on CPUs that support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_avx2_1(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // Borrow exactly the 48 output bytes up front so a short buffer fails
    // before any conversion work is done.
    let start = *offset;
    let window: &mut [u8; 48] = out
        .get_mut(start..start + 48)
        .expect("Slice to small cannot write")
        .try_into()
        .unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline(y, cb, cr);

    // View each clamped register as 16 lanes; the values are already in
    // 0..=255 so the narrowing casts below are lossless.
    let (reds, greens, blues) = (r.array, g.array, b.array);
    let channels = reds.iter().zip(greens.iter()).zip(blues.iter());

    for (pixel, ((&red, &green), &blue)) in window.chunks_exact_mut(3).zip(channels) {
        pixel[0] = red as u8;
        pixel[1] = green as u8;
        pixel[2] = blue as u8;
    }

    *offset += 48;
}

/// Baseline implementation of YCBCR to RGB for avx,
///
/// It uses integer operations as opposed to floats, the approximation is
/// difficult for the  eye to see, but this means that it may produce different
/// values with libjpeg_turbo.  if accuracy is of utmost importance, use that.
///
/// this function should be called for most implementations, including
/// - ycbcr->rgb
/// - ycbcr->rgba
/// - ycbcr->brga
/// - ycbcr->rgbx
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_baseline(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (YmmRegister, YmmRegister, YmmRegister) {
    // Load values into a register
    //
    // dst[127:0] := MEM[loaddr+127:loaddr]
    // dst[255:128] := MEM[hiaddr+127:hiaddr]
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());

    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());

    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());

    // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb

    // Cb = Cb-128;
    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));

    // cr = Cb -128;
    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));

    // Calculate Y->R
    // r = Y + 45 * Cr / 32
    // 45*cr
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);

    // r1>>5
    let r2 = _mm256_srai_epi16::<5>(r1);

    //y+r2

    let r = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
    };

    // g = Y - (11 * Cb + 23 * Cr) / 32 ;

    // 11*cb
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);

    // 23*cr
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);

    //(11
    //(11 * Cb + 23 * Cr)
    let g3 = _mm256_add_epi16(g1, g2);

    // (11 * Cb + 23 * Cr) / 32
    let g4 = _mm256_srai_epi16::<5>(g3);

    // Y - (11 * Cb + 23 * Cr) / 32 ;
    let g = YmmRegister {
        mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
    };

    // b = Y + 113 * Cb / 64
    // 113 * cb
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);

    //113 * Cb / 64
    let b2 = _mm256_srai_epi16::<6>(b1);

    // b = Y + 113 * Cb / 64 ;
    let b = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
    };

    return (r, g, b);
}

/// Integer YCbCr to RGB conversion of 16 pixels that skips clamping.
///
/// Per lane (16-bit arithmetic, arithmetic right shifts):
///
/// ```text
/// Cb' = Cb - 128,  Cr' = Cr - 128
/// R = Y + (45 * Cr') >> 5
/// G = Y - (11 * Cb' + 23 * Cr') >> 5
/// B = Y + (113 * Cb') >> 6
/// ```
///
/// The results may fall outside `0..=255`; `ycbcr_to_rgba_unsafe` relies on
/// the saturating pack to bring them back into range.
///
/// # Safety
/// Must only be called on CPUs that support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    // Unaligned loads: the input arrays are only guaranteed i16 alignment.
    let luma = _mm256_loadu_si256(y.as_ptr().cast());

    // Centre both chroma planes around zero.
    let bias = _mm256_set1_epi16(128);
    let blue_diff = _mm256_sub_epi16(_mm256_loadu_si256(cb.as_ptr().cast()), bias);
    let red_diff = _mm256_sub_epi16(_mm256_loadu_si256(cr.as_ptr().cast()), bias);

    // R = Y + (45 * Cr') / 32
    let red_term = _mm256_srai_epi16::<5>(_mm256_mullo_epi16(_mm256_set1_epi16(45), red_diff));
    let red = _mm256_add_epi16(luma, red_term);

    // G = Y - (11 * Cb' + 23 * Cr') / 32
    let green_sum = _mm256_add_epi16(
        _mm256_mullo_epi16(_mm256_set1_epi16(11), blue_diff),
        _mm256_mullo_epi16(_mm256_set1_epi16(23), red_diff)
    );
    let green = _mm256_sub_epi16(luma, _mm256_srai_epi16::<5>(green_sum));

    // B = Y + (113 * Cb') / 64
    let blue_term = _mm256_srai_epi16::<6>(_mm256_mullo_epi16(_mm256_set1_epi16(113), blue_diff));
    let blue = _mm256_add_epi16(blue_term, luma);

    (red, green, blue)
}

/// Convert 16 YCbCr pixels to interleaved RGBA using AVX2 instructions
///
/// Writes 64 bytes (16 pixels x 4 channels, alpha set to 255) into `out`
/// starting at `*offset` and then advances `*offset` by 64.
///
/// # Note
/// This wrapper performs no runtime CPU feature check; it must only be called
/// on CPUs that support AVX2.
///
/// # Arguments
/// - `y`,`cb`,`cr`: References to 16 `i16` samples each, one per pixel
/// - `out`: The output slice where the converted pixels are stored
/// - `offset`: The position in `out` where writing starts; incremented by 64
///
/// # Panics
/// If `out` does not have 64 bytes available starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // SAFETY (caller contract): the callee is compiled with AVX2 enabled and
    // is only sound on CPUs that support it.
    unsafe {
        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
    }
}

#[inline]
#[target_feature(enable = "avx2")]
#[rustfmt::skip]
unsafe fn ycbcr_to_rgba_unsafe(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
    out: &mut [u8],
    offset: &mut usize,
)
{
    // check if we have enough space to write.
    let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);

    // set alpha channel to 255 for opaque

    // And no these comments were not from me pressing the keyboard

    // Pack the integers into u8's using signed saturation.
    let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
    let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd
    // transpose_u16 and interleave channels
    let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
    let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
    // final transpose_u16
    let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
    let h = _mm256_unpackhi_epi8(e, f);


    // undo packus shuffling...
    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);

    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);

    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);

    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);

    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);

    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);


    // Store
    // Use streaming instructions to prevent polluting the cache?
    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);

    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);

    *offset += 64;
}

/// Clamp every 16-bit lane of `reg` to the range `0..=255`.
///
/// Lanes are treated as signed `i16`: negative lanes become 0 and lanes
/// above 255 become 255 (the accepted values for an RGB channel).
///
/// # Safety
/// Must only be called on CPUs that support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let floor = _mm256_setzero_si256();
    let ceiling = _mm256_set1_epi16(255);

    // min(max(lane, 0), 255)
    _mm256_min_epi16(_mm256_max_epi16(reg, floor), ceiling)
}

/// Pack four 2-bit selectors into one immediate, `z` in the top bits and `w`
/// in the bottom bits (the same layout as C's `_MM_SHUFFLE(z, y, x, w)`).
///
/// Inputs are not masked; callers are expected to pass values in `0..=3`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    w | (x << 2) | (y << 4) | (z << 6)
}