Skip to content

Commit 4d92865

Browse files
author
Daniel Smith
committed
Add 32bit scatter intrinsics
1 parent 3e0675d commit 4d92865

File tree

1 file changed

+166
-2
lines changed

1 file changed

+166
-2
lines changed

crates/core_arch/src/x86/avx512f.rs

Lines changed: 166 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,51 @@ pub unsafe fn _mm512_mask_i64scatter_pd(
579579
constify_imm8_gather!(scale, call);
580580
}
581581

582+
/// Scatter single-precision (32-bit) floating-point elements from memory using 32-bit indices.
583+
///
584+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_ps)
585+
#[inline]
586+
#[target_feature(enable = "avx512f")]
587+
#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))]
588+
#[rustc_args_required_const(3)]
589+
pub unsafe fn _mm512_i32scatter_ps(slice: *mut u8, offsets: __m512i, src: __m512, scale: i32) {
590+
let src = src.as_f32x16();
591+
let neg_one = -1;
592+
let slice = slice as *mut i8;
593+
let offsets = offsets.as_i32x16();
594+
macro_rules! call {
595+
($imm8:expr) => {
596+
vscatterdps(slice, neg_one, offsets, src, $imm8)
597+
};
598+
}
599+
constify_imm8_gather!(scale, call);
600+
}
601+
602+
/// Scatter single-precision (32-bit) floating-point elements from src into memory using 32-bit indices.
603+
///
604+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_ps)
605+
#[inline]
606+
#[target_feature(enable = "avx512f")]
607+
#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))]
608+
#[rustc_args_required_const(4)]
609+
pub unsafe fn _mm512_mask_i32scatter_ps(
610+
slice: *mut u8,
611+
mask: __mmask16,
612+
offsets: __m512i,
613+
src: __m512,
614+
scale: i32,
615+
) {
616+
let src = src.as_f32x16();
617+
let slice = slice as *mut i8;
618+
let offsets = offsets.as_i32x16();
619+
macro_rules! call {
620+
($imm8:expr) => {
621+
vscatterdps(slice, mask as i16, offsets, src, $imm8)
622+
};
623+
}
624+
constify_imm8_gather!(scale, call);
625+
}
626+
582627
/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices.
583628
///
584629
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps)
@@ -716,6 +761,52 @@ pub unsafe fn _mm512_mask_i64scatter_epi64(
716761
constify_imm8_gather!(scale, call);
717762
}
718763

764+
/// Scatter 32-bit integers from src into memory using 32-bit indices.
765+
///
766+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32)
767+
#[inline]
768+
#[target_feature(enable = "avx512f")]
769+
#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))]
770+
#[rustc_args_required_const(3)]
771+
pub unsafe fn _mm512_i32scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) {
772+
let src = src.as_i32x16();
773+
let neg_one = -1;
774+
let slice = slice as *mut i8;
775+
let offsets = offsets.as_i32x16();
776+
macro_rules! call {
777+
($imm8:expr) => {
778+
vpscatterdd(slice, neg_one, offsets, src, $imm8)
779+
};
780+
}
781+
constify_imm8_gather!(scale, call);
782+
}
783+
784+
/// Scatter 32-bit integers from src into memory using 32-bit indices.
785+
///
786+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi32)
787+
#[inline]
788+
#[target_feature(enable = "avx512f")]
789+
#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))]
790+
#[rustc_args_required_const(4)]
791+
pub unsafe fn _mm512_mask_i32scatter_epi32(
792+
slice: *mut u8,
793+
mask: __mmask16,
794+
offsets: __m512i,
795+
src: __m512i,
796+
scale: i32,
797+
) {
798+
let src = src.as_i32x16();
799+
let mask = mask as i16;
800+
let slice = slice as *mut i8;
801+
let offsets = offsets.as_i32x16();
802+
macro_rules! call {
803+
($imm8:expr) => {
804+
vpscatterdd(slice, mask, offsets, src, $imm8)
805+
};
806+
}
807+
constify_imm8_gather!(scale, call);
808+
}
809+
719810
/// Scatter 32-bit integers from src into memory using 64-bit indices.
720811
///
721812
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32)
@@ -1580,6 +1671,8 @@ extern "C" {
15801671

15811672
#[link_name = "llvm.x86.avx512.scatter.dpd.512"]
15821673
fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32);
1674+
#[link_name = "llvm.x86.avx512.scatter.dps.512"]
1675+
fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32);
15831676
#[link_name = "llvm.x86.avx512.scatter.qpd.512"]
15841677
fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32);
15851678
#[link_name = "llvm.x86.avx512.scatter.qps.512"]
@@ -1767,12 +1860,83 @@ mod tests {
17671860
let mask = 0b10101010_10101010;
17681861
#[rustfmt::skip]
17691862
let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
1770-
120, 128, 136, 144, 152, 160, 168, 176);
1863+
128, 144, 160, 176, 192, 208, 224, 240);
17711864
// A multiplier of 4 is word-addressing
17721865
let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4);
17731866
#[rustfmt::skip]
17741867
assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112,
1775-
2, 128, 2, 144, 2, 160, 2, 176));
1868+
2, 144, 2, 176, 2, 208, 2, 240));
1869+
}
1870+
1871+
#[simd_test(enable = "avx512f")]
1872+
unsafe fn test_mm512_i32scatter_ps() {
1873+
let mut arr = [0f32; 256];
1874+
#[rustfmt::skip]
1875+
let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
1876+
128, 144, 160, 176, 192, 208, 224, 240);
1877+
let src = _mm512_setr_ps(
1878+
1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
1879+
);
1880+
// A multiplier of 4 is word-addressing
1881+
_mm512_i32scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4);
1882+
let mut expected = [0f32; 256];
1883+
for i in 0..16 {
1884+
expected[i * 16] = (i + 1) as f32;
1885+
}
1886+
assert_eq!(&arr[..], &expected[..],);
1887+
}
1888+
1889+
#[simd_test(enable = "avx512f")]
1890+
unsafe fn test_mm512_mask_i32scatter_ps() {
1891+
let mut arr = [0f32; 256];
1892+
let mask = 0b10101010_10101010;
1893+
#[rustfmt::skip]
1894+
let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
1895+
128, 144, 160, 176, 192, 208, 224, 240);
1896+
let src = _mm512_setr_ps(
1897+
1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
1898+
);
1899+
// A multiplier of 4 is word-addressing
1900+
_mm512_mask_i32scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4);
1901+
let mut expected = [0f32; 256];
1902+
for i in 0..8 {
1903+
expected[i * 32 + 16] = 2. * (i + 1) as f32;
1904+
}
1905+
assert_eq!(&arr[..], &expected[..],);
1906+
}
1907+
1908+
#[simd_test(enable = "avx512f")]
1909+
unsafe fn test_mm512_i32scatter_epi32() {
1910+
let mut arr = [0i32; 256];
1911+
#[rustfmt::skip]
1912+
1913+
let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
1914+
128, 144, 160, 176, 192, 208, 224, 240);
1915+
let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1916+
// A multiplier of 4 is word-addressing
1917+
_mm512_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4);
1918+
let mut expected = [0i32; 256];
1919+
for i in 0..16 {
1920+
expected[i * 16] = (i + 1) as i32;
1921+
}
1922+
assert_eq!(&arr[..], &expected[..],);
1923+
}
1924+
1925+
#[simd_test(enable = "avx512f")]
1926+
unsafe fn test_mm512_mask_i32scatter_epi32() {
1927+
let mut arr = [0i32; 256];
1928+
let mask = 0b10101010_10101010;
1929+
#[rustfmt::skip]
1930+
let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
1931+
128, 144, 160, 176, 192, 208, 224, 240);
1932+
let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1933+
// A multiplier of 4 is word-addressing
1934+
_mm512_mask_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4);
1935+
let mut expected = [0i32; 256];
1936+
for i in 0..8 {
1937+
expected[i * 32 + 16] = 2 * (i + 1) as i32;
1938+
}
1939+
assert_eq!(&arr[..], &expected[..],);
17761940
}
17771941

17781942
#[simd_test(enable = "avx512f")]

0 commit comments

Comments
 (0)