zune_jpeg/idct/
scalar.rs

1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! Platform independent IDCT algorithm
10//!
11//! Not as fast as AVX one.
12
/// Bias added to every even-part term of the row pass before the final `>> 17`.
///
/// * `128 << 17` - level shift, so the result lands in `0..=255` rather than
///   `-128..=127` once shifted down.
/// * `1 << 16` (65536) - half of `1 << 17`, i.e. rounding for the `>> 17`.
/// * `1 << 9` (512) - NOTE(review): equals the rounding bias used by the
///   column pass; why it is also added here is not evident from this file.
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + (1 << 9);
14
15#[allow(unused_assignments)]
16#[allow(
17    clippy::too_many_lines,
18    clippy::op_ref,
19    clippy::cast_possible_truncation
20)]
21pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
22    // Temporary variables.
23
24    let mut pos = 0;
25
26    let mut i = 0;
27    // Don't check for zeroes inside loop, lift it and check outside
28    // we want to accelerate the case with 63 0 ac coeff
29    if &in_vector[1..] == &[0_i32; 63] {
30        // okay then if you work, yay, let's write you really quick
31        let coeff = [(((in_vector[0] >> 3) + 128) as i16).clamp(0, 255); 8];
32
33        macro_rules! store {
34            ($index:tt) => {
35                // position of the MCU
36                let mcu_stride: &mut [i16; 8] = out_vector
37                    .get_mut($index..$index + 8)
38                    .unwrap()
39                    .try_into()
40                    .unwrap();
41                // copy coefficients
42                mcu_stride.copy_from_slice(&coeff);
43                // increment index
44                $index += stride;
45            };
46        }
47        // write to four positions
48        store!(pos);
49        store!(pos);
50        store!(pos);
51        store!(pos);
52
53        store!(pos);
54        store!(pos);
55        store!(pos);
56        store!(pos);
57    } else {
58        // because the compiler fails to see that it can be auto_vectorised so i'll
59        // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
60        for ptr in 0..8 {
61            let p2 = in_vector[ptr + 16];
62            let p3 = in_vector[ptr + 48];
63
64            let p1 = (p2 + p3).wrapping_mul(2217);
65
66            let t2 = p1 + p3 * -7567;
67            let t3 = p1 + p2 * 3135;
68
69            let p2 = in_vector[ptr];
70            let p3 = in_vector[32 + ptr];
71            let t0 = fsh(p2 + p3);
72            let t1 = fsh(p2 - p3);
73
74            let x0 = t0 + t3 + 512;
75            let x3 = t0 - t3 + 512;
76            let x1 = t1 + t2 + 512;
77            let x2 = t1 - t2 + 512;
78
79            // odd part
80            let mut t0 = in_vector[ptr + 56];
81            let mut t1 = in_vector[ptr + 40];
82            let mut t2 = in_vector[ptr + 24];
83            let mut t3 = in_vector[ptr + 8];
84
85            let p3 = t0 + t2;
86            let p4 = t1 + t3;
87            let p1 = t0 + t3;
88            let p2 = t1 + t2;
89            let p5 = (p3 + p4) * 4816;
90
91            t0 *= 1223;
92            t1 *= 8410;
93            t2 *= 12586;
94            t3 *= 6149;
95
96            let p1 = p5 + p1 * -3685;
97            let p2 = p5 + p2 * -10497;
98            let p3 = p3 * -8034;
99            let p4 = p4 * -1597;
100
101            t3 += p1 + p4;
102            t2 += p2 + p3;
103            t1 += p2 + p4;
104            t0 += p1 + p3;
105
106            // constants scaled things up by 1<<12; let's bring them back
107            // down, but keep 2 extra bits of precision
108            in_vector[ptr] = (x0 + t3) >> 10;
109            in_vector[ptr + 8] = (x1 + t2) >> 10;
110            in_vector[ptr + 16] = (x2 + t1) >> 10;
111            in_vector[ptr + 24] = (x3 + t0) >> 10;
112            in_vector[ptr + 32] = (x3 - t0) >> 10;
113            in_vector[ptr + 40] = (x2 - t1) >> 10;
114            in_vector[ptr + 48] = (x1 - t2) >> 10;
115            in_vector[ptr + 56] = (x0 - t3) >> 10;
116        }
117
118        // This is vectorised in architectures supporting SSE 4.1
119        while i < 64 {
120            // We won't try to short circuit here because it rarely works
121
122            // Even part
123            let p2 = in_vector[i + 2];
124            let p3 = in_vector[i + 6];
125
126            let p1 = (p2 + p3) * 2217;
127            let t2 = p1 + p3 * -7567;
128            let t3 = p1 + p2 * 3135;
129
130            let p2 = in_vector[i];
131            let p3 = in_vector[i + 4];
132
133            let t0 = fsh(p2 + p3);
134            let t1 = fsh(p2 - p3);
135            // constants scaled things up by 1<<12, plus we had 1<<2 from first
136            // loop, plus horizontal and vertical each scale by sqrt(8) so together
137            // we've got an extra 1<<3, so 1<<17 total we need to remove.
138            // so we want to round that, which means adding 0.5 * 1<<17,
139            // aka 65536. Also, we'll end up with -128 to 127 that we want
140            // to encode as 0..255 by adding 128, so we'll add that before the shift
141            let x0 = t0 + t3 + SCALE_BITS;
142            let x3 = t0 - t3 + SCALE_BITS;
143            let x1 = t1 + t2 + SCALE_BITS;
144            let x2 = t1 - t2 + SCALE_BITS;
145            // odd part
146            let mut t0 = in_vector[i + 7];
147            let mut t1 = in_vector[i + 5];
148            let mut t2 = in_vector[i + 3];
149            let mut t3 = in_vector[i + 1];
150
151            let p3 = t0 + t2;
152            let p4 = t1 + t3;
153            let p1 = t0 + t3;
154            let p2 = t1 + t2;
155            let p5 = (p3 + p4) * f2f(1.175875602);
156
157            t0 = t0.wrapping_mul(1223);
158            t1 = t1.wrapping_mul(8410);
159            t2 = t2.wrapping_mul(12586);
160            t3 = t3.wrapping_mul(6149);
161
162            let p1 = p5 + p1 * -3685;
163            let p2 = p5 + p2 * -10497;
164            let p3 = p3 * -8034;
165            let p4 = p4 * -1597;
166
167            t3 += p1 + p4;
168            t2 += p2 + p3;
169            t1 += p2 + p4;
170            t0 += p1 + p3;
171
172            let out: &mut [i16; 8] = out_vector
173                .get_mut(pos..pos + 8)
174                .unwrap()
175                .try_into()
176                .unwrap();
177
178            out[0] = clamp((x0 + t3) >> 17);
179            out[1] = clamp((x1 + t2) >> 17);
180            out[2] = clamp((x2 + t1) >> 17);
181            out[3] = clamp((x3 + t0) >> 17);
182            out[4] = clamp((x3 - t0) >> 17);
183            out[5] = clamp((x2 - t1) >> 17);
184            out[6] = clamp((x1 - t2) >> 17);
185            out[7] = clamp((x0 - t3) >> 17);
186
187            i += 8;
188
189            pos += stride;
190        }
191    }
192}
193
/// Convert a floating point factor to fixed point with 12 fractional bits.
///
/// The value is multiplied by 4096, offset by 0.5 and then truncated towards
/// zero by the cast, which rounds non-negative inputs to the nearest integer.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    let biased = x * 4096.0 + 0.5;

    biased as i32
}
200
/// Scale an integer into the 12-bit fixed point domain, i.e. multiply it by
/// 4096 using a left shift (bits shifted past the top are discarded).
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;

    x << FRACTION_BITS
}
206
/// Saturate `a` to the range `0..=255` and narrow it to `i16`.
///
/// The narrowing cast cannot truncate because the value already fits in
/// eight bits by the time it is converted.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    let saturated = a.max(0).min(255);

    saturated as i16
}