#![cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
#![allow(clippy::wildcard_imports)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::{Add, AddAssign, Mul, MulAssign, Sub};

/// Pack four 2-bit lane selectors into an 8-bit shuffle-control
/// immediate, the Rust equivalent of C's `_MM_SHUFFLE(z, y, x, w)`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}
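
// A compile-time sanity check (a sketch, not in the original file): the
// packed control byte for selectors (3, 1, 2, 0) is 0b11_01_10_00, the same
// value C's `_MM_SHUFFLE(3, 1, 2, 0)` expands to.
const _: () = assert!(shuffle(3, 1, 2, 0) == 0b11_01_10_00);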

/// A thin wrapper over an AVX `__m256i` register holding eight packed
/// `i32` lanes; element-wise arithmetic is provided via the operator
/// impls below.
#[derive(Clone, Copy)]
pub struct YmmRegister {
    pub(crate) mm256: __m256i,
}

// The operator impls lower directly to AVX2 intrinsics. As with the rest of
// this module, they assume the caller has already verified that the CPU
// supports AVX2 before constructing a `YmmRegister`.
impl Add for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, rhs.mm256),
            }
        }
    }
}

impl Add<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, tmp),
            }
        }
    }
}

impl Sub for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_sub_epi32(self.mm256, rhs.mm256),
            }
        }
    }
}

impl AddAssign for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_add_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl AddAssign<i32> for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_add_epi32(self.mm256, tmp);
        }
    }
}

impl Mul for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, rhs.mm256),
            }
        }
    }
}

impl Mul<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, tmp),
            }
        }
    }
}

impl MulAssign for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl MulAssign<i32> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_mullo_epi32(self.mm256, tmp);
        }
    }
}

impl MulAssign<__m256i> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: __m256i) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs);
        }
    }
}
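
// A minimal usage sketch (test-only, not part of the original file). It
// assumes a `std` test environment and skips on CPUs without AVX2; the
// lanes are read back with `transmute` purely for the assertions.
#[cfg(test)]
mod ymm_sketch {
    use super::*;

    #[test]
    fn operators_are_lane_wise() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        unsafe {
            let a = YmmRegister { mm256: _mm256_set1_epi32(2) };
            let b = YmmRegister { mm256: _mm256_set1_epi32(3) };
            let sum: [i32; 8] = core::mem::transmute((a + b).mm256);
            let prod: [i32; 8] = core::mem::transmute((a * b).mm256);
            assert_eq!(sum, [5; 8]);
            assert_eq!(prod, [6; 8]);
        }
    }
}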

type Reg = YmmRegister;

/// Transpose an 8x8 matrix of `i32` values held in eight AVX2 registers,
/// one row per register, in place.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2.
#[allow(unused_parens, clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn transpose(
    v0: &mut Reg, v1: &mut Reg, v2: &mut Reg, v3: &mut Reg, v4: &mut Reg, v5: &mut Reg,
    v6: &mut Reg, v7: &mut Reg
) {
    // Interleave the 32-bit lanes of two vectors. The `permute4x64`
    // pre-shuffle reorders the 64-bit quarters so that the in-lane
    // `unpack{lo,hi}_epi32` produce contiguous results despite AVX2's
    // two-lane structure.
    macro_rules! merge_epi32 {
        ($v0:tt, $v1:tt, $v2:tt, $v3:tt) => {
            let va = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>($v0);
            let vb = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>($v1);
            $v2 = _mm256_unpacklo_epi32(va, vb);
            $v3 = _mm256_unpackhi_epi32(va, vb);
        };
    }

    // Same idea one level up: interleave the 64-bit pairs of two vectors.
    macro_rules! merge_epi64 {
        ($v0:tt, $v1:tt, $v2:tt, $v3:tt) => {
            let va = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>($v0);
            let vb = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>($v1);
            $v2 = _mm256_unpacklo_epi64(va, vb);
            $v3 = _mm256_unpackhi_epi64(va, vb);
        };
    }

    // Final step: recombine the low and high 128-bit halves of two vectors.
    macro_rules! merge_si128 {
        ($v0:tt, $v1:tt, $v2:tt, $v3:tt) => {
            $v2 = _mm256_permute2x128_si256::<{ shuffle(0, 2, 0, 0) }>($v0, $v1);
            $v3 = _mm256_permute2x128_si256::<{ shuffle(0, 3, 0, 1) }>($v0, $v1);
        };
    }
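
    // The transpose runs in three rounds, doubling the interleave width each
    // time: the 32-bit unpacks build transposed 2x2 blocks, the 64-bit
    // unpacks grow them to 4x4, and the 128-bit permutes stitch together the
    // full 8x8 result.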

    // Round 1: 32-bit interleave of adjacent row pairs.
    let (w0, w1, w2, w3, w4, w5, w6, w7);

    merge_epi32!((v0.mm256), (v1.mm256), w0, w1);
    merge_epi32!((v2.mm256), (v3.mm256), w2, w3);
    merge_epi32!((v4.mm256), (v5.mm256), w4, w5);
    merge_epi32!((v6.mm256), (v7.mm256), w6, w7);

    // Round 2: 64-bit interleave of the intermediate vectors.
    let (x0, x1, x2, x3, x4, x5, x6, x7);

    merge_epi64!(w0, w2, x0, x1);
    merge_epi64!(w1, w3, x2, x3);
    merge_epi64!(w4, w6, x4, x5);
    merge_epi64!(w5, w7, x6, x7);

    // Round 3: 128-bit merge, writing the transposed rows back in place.
    merge_si128!(x0, x4, (v0.mm256), (v1.mm256));
    merge_si128!(x1, x5, (v2.mm256), (v3.mm256));
    merge_si128!(x2, x6, (v4.mm256), (v5.mm256));
    merge_si128!(x3, x7, (v6.mm256), (v7.mm256));
}
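
// A correctness sketch for `transpose` (test-only, not in the original
// file): load an 8x8 matrix with distinct elements, transpose it, and check
// that `out[i][j] == src[j][i]`. Skips on CPUs without AVX2.
#[cfg(test)]
mod transpose_sketch {
    use super::*;

    #[test]
    fn transposes_8x8() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        // src[i][j] = i * 8 + j, so every element is distinct.
        let src: [[i32; 8]; 8] =
            core::array::from_fn(|i| core::array::from_fn(|j| (i * 8 + j) as i32));
        let mut out = [[0_i32; 8]; 8];
        unsafe {
            let mut r = [YmmRegister { mm256: _mm256_setzero_si256() }; 8];
            for i in 0..8 {
                r[i].mm256 = _mm256_loadu_si256(src[i].as_ptr().cast());
            }
            let [mut r0, mut r1, mut r2, mut r3, mut r4, mut r5, mut r6, mut r7] = r;
            transpose(
                &mut r0, &mut r1, &mut r2, &mut r3, &mut r4, &mut r5, &mut r6, &mut r7,
            );
            for (row, reg) in out.iter_mut().zip([r0, r1, r2, r3, r4, r5, r6, r7]) {
                _mm256_storeu_si256(row.as_mut_ptr().cast(), reg.mm256);
            }
        }
        for i in 0..8 {
            for j in 0..8 {
                assert_eq!(out[i][j], src[j][i]);
            }
        }
    }
}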