Skip to content

Commit bed25b2

Browse files
authored
Fix the implementation of _mm256_alignr_epi8 (#330)
This was most likely just a copy/paste error, so this commit re-reviews the intrinsic and aligns its implementation with the one in clang. Closes #328
1 parent 48f4db0 commit bed25b2

File tree

1 file changed

+114
-46
lines changed

1 file changed

+114
-46
lines changed

coresimd/x86/i586/avx2.rs

Lines changed: 114 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
121121
/// result, shift the result right by `n` bytes, and return the low 16 bytes.
122122
#[inline]
123123
#[target_feature(enable = "avx2")]
124-
#[cfg_attr(test, assert_instr(vpalignr, n = 15))]
124+
#[cfg_attr(test, assert_instr(vpalignr, n = 7))]
125125
#[rustc_args_required_const(2)]
126126
pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
127127
let n = n as u32;
@@ -141,46 +141,104 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
141141
let a = a.as_i8x32();
142142
let b = b.as_i8x32();
143143

144-
macro_rules! shuffle {
145-
($shift:expr) => {
144+
let r: i8x32 = match n {
145+
0 => {
146146
simd_shuffle32(b, a, [
147-
0 + $shift, 1 + $shift,
148-
2 + $shift, 3 + $shift,
149-
4 + $shift, 5 + $shift,
150-
6 + $shift, 7 + $shift,
151-
8 + $shift, 9 + $shift,
152-
10 + $shift, 11 + $shift,
153-
12 + $shift, 13 + $shift,
154-
14 + $shift, 15 + $shift,
155-
16 + $shift, 17 + $shift,
156-
18 + $shift, 19 + $shift,
157-
20 + $shift, 21 + $shift,
158-
22 + $shift, 23 + $shift,
159-
24 + $shift, 25 + $shift,
160-
26 + $shift, 27 + $shift,
161-
28 + $shift, 29 + $shift,
162-
30 + $shift, 31 + $shift,
147+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
148+
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
163149
])
164150
}
165-
}
166-
let r: i8x32 = match n {
167-
0 => shuffle!(0),
168-
1 => shuffle!(1),
169-
2 => shuffle!(2),
170-
3 => shuffle!(3),
171-
4 => shuffle!(4),
172-
5 => shuffle!(5),
173-
6 => shuffle!(6),
174-
7 => shuffle!(7),
175-
8 => shuffle!(8),
176-
9 => shuffle!(9),
177-
10 => shuffle!(10),
178-
11 => shuffle!(11),
179-
12 => shuffle!(12),
180-
13 => shuffle!(13),
181-
14 => shuffle!(14),
182-
15 => shuffle!(15),
183-
_ => shuffle!(16),
151+
1 => {
152+
simd_shuffle32(b, a, [
153+
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
154+
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48,
155+
])
156+
}
157+
2 => {
158+
simd_shuffle32(b, a, [
159+
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33,
160+
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49,
161+
])
162+
}
163+
3 => {
164+
simd_shuffle32(b, a, [
165+
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34,
166+
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
167+
])
168+
}
169+
4 => {
170+
simd_shuffle32(b, a, [
171+
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35,
172+
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
173+
])
174+
}
175+
5 => {
176+
simd_shuffle32(b, a, [
177+
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36,
178+
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
179+
])
180+
}
181+
6 => {
182+
simd_shuffle32(b, a, [
183+
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37,
184+
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
185+
])
186+
}
187+
7 => {
188+
simd_shuffle32(b, a, [
189+
7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38,
190+
23, 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
191+
])
192+
}
193+
8 => {
194+
simd_shuffle32(b, a, [
195+
8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39,
196+
24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
197+
])
198+
}
199+
9 => {
200+
simd_shuffle32(b, a, [
201+
9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40,
202+
25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
203+
])
204+
}
205+
10 => {
206+
simd_shuffle32(b, a, [
207+
10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
208+
26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
209+
])
210+
}
211+
11 => {
212+
simd_shuffle32(b, a, [
213+
11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
214+
27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
215+
])
216+
}
217+
12 => {
218+
simd_shuffle32(b, a, [
219+
12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
220+
28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
221+
])
222+
}
223+
13 => {
224+
simd_shuffle32(b, a, [
225+
13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
226+
29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
227+
])
228+
}
229+
14 => {
230+
simd_shuffle32(b, a, [
231+
14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
232+
30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
233+
])
234+
}
235+
15 => {
236+
simd_shuffle32(b, a, [
237+
15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
238+
31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
239+
])
240+
}
241+
_ => b,
184242
};
185243
mem::transmute(r)
186244
}
@@ -4747,29 +4805,39 @@ mod tests {
47474805
#[cfg_attr(rustfmt, rustfmt_skip)]
47484806
let expected = _mm256_setr_epi8(
47494807
2, 3, 4, 5, 6, 7, 8, 9,
4750-
10, 11, 12, 13, 14, 15, 16, 17,
4808+
10, 11, 12, 13, 14, 15, 16, 0,
47514809
18, 19, 20, 21, 22, 23, 24, 25,
47524810
26, 27, 28, 29, 30, 31, 32, 0,
47534811
);
47544812
assert_eq_m256i(r, expected);
47554813

4814+
let r = _mm256_alignr_epi8(a, b, 4);
47564815
#[cfg_attr(rustfmt, rustfmt_skip)]
47574816
let expected = _mm256_setr_epi8(
4758-
-17, -18, -19, -20, -21, -22, -23, -24,
4759-
-25, -26, -27, -28, -29, -30, -31, -32,
4760-
1, 2, 3, 4, 5, 6, 7, 8,
4761-
9, 10, 11, 12, 13, 14, 15, 16,
4817+
-5, -6, -7, -8, -9, -10, -11, -12,
4818+
-13, -14, -15, -16, 1, 2, 3, 4,
4819+
-21, -22, -23, -24, -25, -26, -27, -28,
4820+
-29, -30, -31, -32, 17, 18, 19, 20,
4821+
);
4822+
assert_eq_m256i(r, expected);
4823+
4824+
#[cfg_attr(rustfmt, rustfmt_skip)]
4825+
let expected = _mm256_setr_epi8(
4826+
-1, -2, -3, -4, -5, -6, -7, -8,
4827+
-9, -10, -11, -12, -13, -14, -15, -16, -17,
4828+
-18, -19, -20, -21, -22, -23, -24, -25,
4829+
-26, -27, -28, -29, -30, -31, -32,
47624830
);
47634831
let r = _mm256_alignr_epi8(a, b, 16);
47644832
assert_eq_m256i(r, expected);
47654833

47664834
let r = _mm256_alignr_epi8(a, b, 15);
47674835
#[cfg_attr(rustfmt, rustfmt_skip)]
47684836
let expected = _mm256_setr_epi8(
4769-
-16, -17, -18, -19, -20, -21, -22, -23,
4770-
-24, -25, -26, -27, -28, -29, -30, -31,
4771-
-32, 1, 2, 3, 4, 5, 6, 7,
4837+
-16, 1, 2, 3, 4, 5, 6, 7,
47724838
8, 9, 10, 11, 12, 13, 14, 15,
4839+
-32, 17, 18, 19, 20, 21, 22, 23,
4840+
24, 25, 26, 27, 28, 29, 30, 31,
47734841
);
47744842
assert_eq_m256i(r, expected);
47754843

0 commit comments

Comments
 (0)