
Commit c8f5d35

Restructure x86 signed pack instructions
This reduces the amount of duplicated code and the chance for bugs. I validated the new code for correctness against LLVM using the following script; it found many bugs in the implementation until I was finally able to get it correct and passing.

```rust
//! Test for x86 pack instructions. Prints deterministic results, use it to compare backends.

use std::arch::x86_64::{self, __m128i, __m256i};

use rand::{rngs::SmallRng, Rng, SeedableRng};

fn main() {
    let rng = &mut SmallRng::seed_from_u64(123);
    for _ in 0..100_000 {
        unsafe {
            sse_test(rng);
            avx_test(rng);
        }
    }
}

unsafe fn sse_test(rng: &mut SmallRng) {
    print_sse_8(x86_64::_mm_packus_epi16(sse16(rng), sse16(rng)));
    print_sse_8(x86_64::_mm_packs_epi16(sse16(rng), sse16(rng)));
    print_sse_16(x86_64::_mm_packus_epi32(sse32(rng), sse32(rng)));
    print_sse_16(x86_64::_mm_packs_epi32(sse32(rng), sse32(rng)));
}

unsafe fn avx_test(rng: &mut SmallRng) {
    print_avx_8(x86_64::_mm256_packus_epi16(avx16(rng), avx16(rng)));
    print_avx_8(x86_64::_mm256_packs_epi16(avx16(rng), avx16(rng)));
    print_avx_16(x86_64::_mm256_packus_epi32(avx32(rng), avx32(rng)));
    print_avx_16(x86_64::_mm256_packs_epi32(avx32(rng), avx32(rng)));
}

fn print_sse_8(t: __m128i) {
    let ints = unsafe { std::mem::transmute::<_, [i8; 16]>(t) };
    println!("{ints:?}");
}

fn print_sse_16(t: __m128i) {
    let ints = unsafe { std::mem::transmute::<_, [i16; 8]>(t) };
    println!("{ints:?}");
}

fn print_avx_8(t: __m256i) {
    let ints = unsafe { std::mem::transmute::<_, [i8; 32]>(t) };
    println!("{ints:?}");
}

fn print_avx_16(t: __m256i) {
    let ints = unsafe { std::mem::transmute::<_, [i16; 16]>(t) };
    println!("{ints:?}");
}

fn sse16(rand: &mut SmallRng) -> __m128i {
    unsafe { std::mem::transmute([(); 8].map(|()| i16(rand))) }
}

fn sse32(rand: &mut SmallRng) -> __m128i {
    unsafe { std::mem::transmute([(); 4].map(|()| i32(rand))) }
}

fn avx16(rand: &mut SmallRng) -> __m256i {
    unsafe { std::mem::transmute([(); 16].map(|()| i16(rand))) }
}

fn avx32(rand: &mut SmallRng) -> __m256i {
    unsafe { std::mem::transmute([(); 8].map(|()| i32(rand))) }
}

fn i16(rand: &mut SmallRng) -> i16 {
    if rand.gen() { rand.gen::<i16>() } else { rand.gen::<i8>() as i16 }
}

fn i32(rand: &mut SmallRng) -> i32 {
    if rand.gen() { rand.gen::<i32>() } else { rand.gen::<i16>() as i32 }
}
```
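For intuition about what the script checks: each pack instruction saturates wider source lanes into narrower output lanes. Below is a minimal scalar reference for a single `_mm_packus_epi16` output lane — an illustrative sketch, not part of the commit; `packus_lane` is a hypothetical name, and the signed variants clamp to the `i8`/`i16` range instead.

```rust
/// Saturate one i16 source lane into the unsigned 8-bit output range,
/// mirroring the smax/umin clamp sequence used in the codegen.
/// (Illustrative helper, not part of the commit.)
fn packus_lane(x: i16) -> u8 {
    x.clamp(0, i16::from(u8::MAX)) as u8
}

fn main() {
    assert_eq!(packus_lane(-5), 0); // negative inputs saturate to 0
    assert_eq!(packus_lane(300), 255); // too-large inputs saturate to u8::MAX
    assert_eq!(packus_lane(42), 42); // in-range inputs pass through
}
```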
Parent: 0c72b43

1 file changed: src/intrinsics/llvm_x86.rs (+132 / -196 lines)
```diff
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -610,230 +610,56 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Sse);
+        }
 
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
+        "llvm.x86.sse2.packsswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848
+            intrinsic_args!(fx, args => (a, b); intrinsic);
 
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Sse);
         }
 
         "llvm.x86.avx2.packuswb" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Avx);
         }
 
-        "llvm.x86.sse2.packssdw.128" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+        "llvm.x86.avx2.packsswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16&ig_expand=4851
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Avx);
         }
 
         "llvm.x86.sse41.packusdw" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
-            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Sse);
+        }
 
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);
 
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Sse);
+        }
 
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
+        "llvm.x86.avx2.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883
+            intrinsic_args!(fx, args => (a, b); intrinsic);
 
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Avx);
         }
 
         "llvm.x86.avx2.packssdw" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Avx);
         }
 
         "llvm.x86.fma.vfmaddsub.ps"
@@ -1407,3 +1233,113 @@ fn llvm_add_sub<'tcx>(
 
     (cb_out, c)
 }
+
+enum PackSize {
+    U8,
+    U16,
+    S8,
+    S16,
+}
+
+impl PackSize {
+    fn ret_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I8,
+            Self::U16 | Self::S16 => types::I16,
+        }
+    }
+    fn src_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I16,
+            Self::U16 | Self::S16 => types::I32,
+        }
+    }
+    fn src_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 | Self::S8 => tcx.types.i16,
+            Self::U16 | Self::S16 => tcx.types.i32,
+        }
+    }
+    fn ret_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 => tcx.types.u8,
+            Self::S8 => tcx.types.i8,
+            Self::U16 => tcx.types.u16,
+            Self::S16 => tcx.types.i16,
+        }
+    }
+    fn max(&self) -> i64 {
+        match self {
+            Self::U8 => u8::MAX as u64 as i64,
+            Self::S8 => i8::MAX as u8 as u64 as i64,
+            Self::U16 => u16::MAX as u64 as i64,
+            Self::S16 => i16::MAX as u16 as u64 as i64,
+        }
+    }
+    fn min(&self) -> i64 {
+        match self {
+            Self::U8 | Self::U16 => 0,
+            Self::S8 => i16::from(i8::MIN) as u16 as i64,
+            Self::S16 => i32::from(i16::MIN) as u32 as i64,
+        }
+    }
+}
+
+enum PackWidth {
+    Sse = 1,
+    Avx = 2,
+}
+impl PackWidth {
+    fn divisor(&self) -> u64 {
+        match self {
+            Self::Sse => 1,
+            Self::Avx => 2,
+        }
+    }
+}
+
+fn pack_instruction<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    a: CValue<'tcx>,
+    b: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    ret_size: PackSize,
+    width: PackWidth,
+) {
+    assert_eq!(a.layout(), b.layout());
+    let layout = a.layout();
+
+    let (src_lane_count, src_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    assert_eq!(src_lane_ty, ret_size.src_ty(fx.tcx));
+    assert_eq!(ret_lane_ty, ret_size.ret_ty(fx.tcx));
+    assert_eq!(src_lane_count * 2, ret_lane_count);
+
+    let min = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.min());
+    let max = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.max());
+    let ret_lane_layout = fx.layout_of(ret_size.ret_ty(fx.tcx));
+
+    let mut round = |source: CValue<'tcx>, source_offset: u64, dest_offset: u64| {
+        let step_amount = src_lane_count / width.divisor();
+        let dest_offset = step_amount * dest_offset;
+        for idx in 0..step_amount {
+            let lane = source.value_lane(fx, step_amount * source_offset + idx).load_scalar(fx);
+            let sat = fx.bcx.ins().smax(lane, min);
+            let sat = match ret_size {
+                PackSize::U8 | PackSize::U16 => fx.bcx.ins().umin(sat, max),
+                PackSize::S8 | PackSize::S16 => fx.bcx.ins().smin(sat, max),
+            };
+            let res = fx.bcx.ins().ireduce(ret_size.ret_clif_type(), sat);
+            let res_lane = CValue::by_val(res, ret_lane_layout);
+            ret.place_lane(fx, dest_offset + idx).write_cvalue(fx, res_lane);
+        }
+    };
+
+    round(a, 0, 0);
+    round(b, 0, 1);
+
+    if let PackWidth::Avx = width {
+        round(a, 1, 2);
+        round(b, 1, 3);
+    }
+}
```
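The `round(a, 0, 0)` through `round(b, 1, 3)` sequence reflects that the 256-bit pack instructions operate on each 128-bit half independently, so the packed halves of `a` and `b` interleave as [a_lo, b_lo, a_hi, b_hi] rather than simply concatenating. A scalar sketch of that lane ordering for `_mm256_packus_epi16` (illustrative only; the function name and model are not part of the commit):

```rust
/// Scalar model of the AVX2 `_mm256_packus_epi16` lane order that the four
/// `round` calls reproduce: each 128-bit half of `a` and `b` is packed
/// independently, yielding [a_lo, b_lo, a_hi, b_hi].
fn packus_epi16_avx2_model(a: [i16; 16], b: [i16; 16]) -> [u8; 32] {
    let sat = |x: i16| x.clamp(0, 255) as u8;
    let mut out = [0u8; 32];
    for i in 0..8 {
        out[i] = sat(a[i]); // round(a, 0, 0)
        out[8 + i] = sat(b[i]); // round(b, 0, 1)
        out[16 + i] = sat(a[8 + i]); // round(a, 1, 2)
        out[24 + i] = sat(b[8 + i]); // round(b, 1, 3)
    }
    out
}

fn main() {
    let out = packus_epi16_avx2_model([300; 16], [-1; 16]);
    assert_eq!(out[..8], [255u8; 8]); // a_lo saturated high
    assert_eq!(out[8..16], [0u8; 8]); // b_lo saturated low
}
```

This per-half interleaving is exactly why the old AVX code needed four copies of the saturate-and-write loop, and why the shared helper takes a width parameter.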
