zune_jpeg/unsafe_utils_avx2.rs

/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

#![cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
//! This module provides unsafe helpers for working with AVX2 registers and intrinsics
#![allow(clippy::wildcard_imports)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::{Add, AddAssign, Mul, MulAssign, Sub};

/// A copy of `_MM_SHUFFLE()` that doesn't require
/// a nightly compiler
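///
/// For example, `shuffle(3, 1, 2, 0)` packs its arguments into `0b11_01_10_00`,
/// the same value `_MM_SHUFFLE(3, 1, 2, 0)` would produce.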
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}

/// A wrapper around an AVX2 ymm register (`__m256i`) that allows the normal
/// arithmetic operators (`+`, `-`, `*`, `+=`, `*=`) to be used on packed 32-bit integers
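///
/// A sketch of how the wrapper is meant to be used inside the crate
/// (hypothetical values; not a runnable doctest, since the type is crate-internal):
///
/// ```ignore
/// let a = YmmRegister { mm256: unsafe { _mm256_set1_epi32(2) } };
/// let b = (a + 3) * a; // lane-wise (2 + 3) * 2 == 10
/// ```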
#[derive(Clone, Copy)]
pub struct YmmRegister {
    /// An AVX register
    pub(crate) mm256: __m256i
}

impl Add for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl Add<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, tmp)
            }
        }
    }
}

impl Sub for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_sub_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl AddAssign for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_add_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl AddAssign<i32> for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_add_epi32(self.mm256, tmp);
        }
    }
}

impl Mul for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl Mul<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, tmp)
            }
        }
    }
}

impl MulAssign for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl MulAssign<i32> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_mullo_epi32(self.mm256, tmp);
        }
    }
}

impl MulAssign<__m256i> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: __m256i) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs);
        }
    }
}

type Reg = YmmRegister;

/// Transpose an 8x8 matrix of `i32`s in place using AVX2 intrinsics, where each
/// register holds one row of the matrix
///
/// This was translated from [here](https://newbedev.com/transpose-an-8x8-float-using-avx-avx2)
#[allow(unused_parens, clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn transpose(
    v0: &mut Reg, v1: &mut Reg, v2: &mut Reg, v3: &mut Reg, v4: &mut Reg, v5: &mut Reg,
    v6: &mut Reg, v7: &mut Reg
) {
    // Interleaves the 32-bit elements of two registers: `$v2` receives the
    // interleaved low halves and `$v3` the interleaved high halves. The 64-bit
    // lane permute compensates for the unpack instructions operating on each
    // 128-bit lane independently.
    macro_rules! merge_epi32 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));

            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi32(va, vb);

            $v3 = _mm256_unpackhi_epi32(va, vb);
        };
    }

    // Same idea as above, but interleaving 64-bit elements.
    macro_rules! merge_epi64 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));

            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi64(va, vb);

            $v3 = _mm256_unpackhi_epi64(va, vb);
        };
    }

    // Combines the low 128-bit halves of the two registers into `$v2` and the
    // high halves into `$v3`.
    macro_rules! merge_si128 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            $v2 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 2, 0, 0));

            $v3 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 3, 0, 1));
        };
    }

    let (w0, w1, w2, w3, w4, w5, w6, w7);

    // Stage 1: interleave the 32-bit elements of adjacent row pairs.
    merge_epi32!((v0.mm256), (v1.mm256), w0, w1);

    merge_epi32!((v2.mm256), (v3.mm256), w2, w3);

    merge_epi32!((v4.mm256), (v5.mm256), w4, w5);

    merge_epi32!((v6.mm256), (v7.mm256), w6, w7);

    let (x0, x1, x2, x3, x4, x5, x6, x7);

    // Stage 2: interleave the 64-bit pairs produced above.
    merge_epi64!(w0, w2, x0, x1);

    merge_epi64!(w1, w3, x2, x3);

    merge_epi64!(w4, w6, x4, x5);

    merge_epi64!(w5, w7, x6, x7);

    // Stage 3: combine 128-bit halves and write the transposed rows back.
    merge_si128!(x0, x4, (v0.mm256), (v1.mm256));

    merge_si128!(x1, x5, (v2.mm256), (v3.mm256));

    merge_si128!(x2, x6, (v4.mm256), (v5.mm256));

    merge_si128!(x3, x7, (v6.mm256), (v7.mm256));
}
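
// A minimal sanity check, not part of the original file: it assumes `std` is
// available in test builds (for `is_x86_feature_detected!`) and uses
// illustrative values; it is a sketch of how the wrapper and `transpose` can
// be exercised, not an exhaustive test.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn wrapper_ops_match_scalar_math() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }

        unsafe {
            let a = YmmRegister { mm256: _mm256_set1_epi32(5) };
            let b = YmmRegister { mm256: _mm256_set1_epi32(3) };

            // (5 + 3) * 2 - 5 == 11, then += 1 -> 12, then *= 3 -> 36 in every lane.
            let mut c = (a + b) * 2 - a;
            c += 1;
            c *= b;

            let mut lanes = [0_i32; 8];
            _mm256_storeu_si256(lanes.as_mut_ptr().cast(), c.mm256);

            assert_eq!(lanes, [36_i32; 8]);
        }
    }

    #[test]
    fn transpose_matches_scalar_reference() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }

        let mut input = [[0_i32; 8]; 8];
        let mut expected = [[0_i32; 8]; 8];

        for i in 0..8 {
            for j in 0..8 {
                input[i][j] = (i * 8 + j) as i32;
                // Scalar reference transpose: expected[j][i] == input[i][j].
                expected[j][i] = (i * 8 + j) as i32;
            }
        }

        unsafe {
            // Each register holds one row of the 8x8 matrix.
            let mut rows = [YmmRegister { mm256: _mm256_setzero_si256() }; 8];

            for (reg, row) in rows.iter_mut().zip(input.iter()) {
                reg.mm256 = _mm256_loadu_si256(row.as_ptr().cast());
            }

            let [ref mut v0, ref mut v1, ref mut v2, ref mut v3, ref mut v4, ref mut v5, ref mut v6, ref mut v7] =
                rows;

            transpose(v0, v1, v2, v3, v4, v5, v6, v7);

            let mut output = [[0_i32; 8]; 8];

            for (row, reg) in output.iter_mut().zip(rows.iter()) {
                _mm256_storeu_si256(row.as_mut_ptr().cast(), reg.mm256);
            }

            assert_eq!(output, expected);
        }
    }
}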