Skip to content

Commit c09ef96

Browse files
committed
Implement _mm_shuffle_epi8
1 parent e4d0811 commit c09ef96

File tree

2 files changed

+34
-10
lines changed

2 files changed

+34
-10
lines changed

example/std_example.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ unsafe fn test_simd() {
197197

198198
test_mm_extract_epi8();
199199
test_mm_insert_epi16();
200+
test_mm_shuffle_epi8();
200201

201202
test_mm256_shuffle_epi8();
202203
test_mm256_permute2x128_si256();
@@ -345,6 +346,26 @@ unsafe fn test_mm_insert_epi16() {
345346
assert_eq_m128i(r, e);
346347
}
347348

349+
/// Checks `_mm_shuffle_epi8` against a hand-computed expected vector.
///
/// Each byte of `b` is used as a selector into the bytes of `a`. The expected
/// values below correspond to indexing `a` with only the low four bits of the
/// selector (24 picks lane 8, 19 picks lane 3) and to producing 0 for the
/// selector 128, whose high bit is set.
#[cfg(target_arch = "x86_64")]
350+
#[target_feature(enable = "ssse3")]
351+
unsafe fn test_mm_shuffle_epi8() {
352+
// Source bytes: lane i holds i + 1, so every result byte shows which lane
// was picked (and a 0 can only come from the zeroing case).
#[rustfmt::skip]
353+
let a = _mm_setr_epi8(
354+
1, 2, 3, 4, 5, 6, 7, 8,
355+
9, 10, 11, 12, 13, 14, 15, 16,
356+
);
357+
// Selector bytes: mixes in-range indices with 128 (expected result 0) and
// with 24 and 19, which are larger than the 16 available lanes.
#[rustfmt::skip]
358+
let b = _mm_setr_epi8(
359+
4, 128_u8 as i8, 4, 3,
360+
24, 12, 6, 19,
361+
12, 5, 5, 10,
362+
4, 1, 8, 0,
363+
);
364+
// expected[i] is a[b[i] & 0xf], except for the 128 selector which gives 0.
let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
365+
let r = _mm_shuffle_epi8(a, b);
366+
assert_eq_m128i(r, expected);
367+
}
368+
348369
#[cfg(target_arch = "x86_64")]
349370
#[target_feature(enable = "avx2")]
350371
unsafe fn test_mm256_shuffle_epi8() {

src/intrinsics/llvm_x86.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
222222
_ => fx.bcx.ins().iconst(types::I32, 0),
223223
});
224224
}
225-
"llvm.x86.avx2.pshuf.b" => {
225+
"llvm.x86.ssse3.pshuf.b.128" | "llvm.x86.avx2.pshuf.b" => {
226226
let (a, b) = match args {
227227
[a, b] => (a, b),
228228
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -241,15 +241,18 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
241241
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
242242
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
243243
}
244-
for i in 16..32 {
245-
let b_lane = b.value_lane(fx, i).load_scalar(fx);
246-
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
247-
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
248-
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
249-
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
250-
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
251-
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
252-
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
244+
245+
if intrinsic == "llvm.x86.avx2.pshuf.b" {
246+
for i in 16..32 {
247+
let b_lane = b.value_lane(fx, i).load_scalar(fx);
248+
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
249+
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
250+
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
251+
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
252+
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
253+
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
254+
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
255+
}
253256
}
254257
}
255258
"llvm.x86.avx2.vperm2i128" => {

0 commit comments

Comments
 (0)