Skip to content

Commit 9b90883

Browse files
author
Daniel Smith
committed
Add 32 bit output gather intrinsics
1 parent f244d2e commit 9b90883

File tree

3 files changed

+169
-0
lines changed

3 files changed

+169
-0
lines changed

crates/core_arch/src/x86/avx512f.rs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,51 @@ pub unsafe fn _mm512_mask_i64gather_pd(
188188
transmute(r)
189189
}
190190

191+
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
192+
///
193+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps)
194+
#[inline]
195+
#[target_feature(enable = "avx512f")]
196+
#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
197+
pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 {
198+
let zero = _mm256_setzero_ps().as_f32x8();
199+
let neg_one = -1;
200+
let slice = slice as *const i8;
201+
let offsets = offsets.as_i64x8();
202+
macro_rules! call {
203+
($imm8:expr) => {
204+
vgatherqps(zero, slice, offsets, neg_one, $imm8)
205+
};
206+
}
207+
let r = constify_imm8!(scale, call);
208+
transmute(r)
209+
}
210+
211+
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
212+
///
213+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps)
214+
#[inline]
215+
#[target_feature(enable = "avx512f")]
216+
#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
217+
pub unsafe fn _mm512_mask_i64gather_ps(
218+
src: __m256,
219+
mask: __mmask8,
220+
offsets: __m512i,
221+
slice: *const u8,
222+
scale: i32,
223+
) -> __m256 {
224+
let src = src.as_f32x8();
225+
let slice = slice as *const i8;
226+
let offsets = offsets.as_i64x8();
227+
macro_rules! call {
228+
($imm8:expr) => {
229+
vgatherqps(src, slice, offsets, mask as i8, $imm8)
230+
};
231+
}
232+
let r = constify_imm8!(scale, call);
233+
transmute(r)
234+
}
235+
191236
/// Gather 64-bit integers from memory using 32-bit indices.
192237
///
193238
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64)
@@ -280,16 +325,66 @@ pub unsafe fn _mm512_mask_i64gather_epi64(
280325
transmute(r)
281326
}
282327

328+
/// Gather 32-bit integers from memory using 64-bit indices.
329+
///
330+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64)
331+
#[inline]
332+
#[target_feature(enable = "avx512f")]
333+
#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
334+
pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i {
335+
let zeros = _mm256_setzero_si256().as_i32x8();
336+
let neg_one = -1;
337+
let slice = slice as *const i8;
338+
let offsets = offsets.as_i64x8();
339+
macro_rules! call {
340+
($imm8:expr) => {
341+
vpgatherqd(zeros, slice, offsets, neg_one, $imm8)
342+
};
343+
}
344+
let r = constify_imm8!(scale, call);
345+
transmute(r)
346+
}
347+
348+
/// Gather 32-bit integers from memory using 64-bit indices.
349+
///
350+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64)
351+
#[inline]
352+
#[target_feature(enable = "avx512f")]
353+
#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
354+
pub unsafe fn _mm512_mask_i64gather_epi32(
355+
src: __m256i,
356+
mask: __mmask8,
357+
offsets: __m512i,
358+
slice: *const u8,
359+
scale: i32,
360+
) -> __m256i {
361+
let src = src.as_i32x8();
362+
let mask = mask as i8;
363+
let slice = slice as *const i8;
364+
let offsets = offsets.as_i64x8();
365+
macro_rules! call {
366+
($imm8:expr) => {
367+
vpgatherqd(src, slice, offsets, mask, $imm8)
368+
};
369+
}
370+
let r = constify_imm8!(scale, call);
371+
transmute(r)
372+
}
373+
283374
#[allow(improper_ctypes)]
284375
extern "C" {
285376
#[link_name = "llvm.x86.avx512.gather.dpd.512"]
286377
fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
287378
#[link_name = "llvm.x86.avx512.gather.qpd.512"]
288379
fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
380+
#[link_name = "llvm.x86.avx512.gather.qps.512"]
381+
fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8;
289382
#[link_name = "llvm.x86.avx512.gather.dpq.512"]
290383
fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
291384
#[link_name = "llvm.x86.avx512.gather.qpq.512"]
292385
fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
386+
#[link_name = "llvm.x86.avx512.gather.qpi.512"]
387+
fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;
293388
}
294389

295390
/// Broadcast 64-bit float `a` to all elements of `dst`.

crates/core_arch/src/x86/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,24 @@ impl m256iExt for __m256i {
504504
}
505505
}
506506

507+
#[allow(non_camel_case_types)]
508+
#[unstable(feature = "stdimd_internal", issue = "none")]
509+
pub(crate) trait m256Ext: Sized {
510+
fn as_m256(self) -> __m256;
511+
512+
#[inline]
513+
fn as_f32x8(self) -> crate::core_arch::simd::f32x8 {
514+
unsafe { transmute(self.as_m256()) }
515+
}
516+
}
517+
518+
impl m256Ext for __m256 {
519+
#[inline]
520+
fn as_m256(self) -> Self {
521+
self
522+
}
523+
}
524+
507525
#[allow(non_camel_case_types)]
508526
#[unstable(feature = "stdimd_internal", issue = "none")]
509527
pub(crate) trait m512iExt: Sized {

crates/core_arch/src/x86_64/avx512f.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,34 @@ mod tests {
352352
assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
353353
}
354354

355+
#[simd_test(enable = "avx512f")]
356+
unsafe fn test_mm512_i64gather_ps() {
357+
let mut arr = [0f32; 128];
358+
for i in 0..128 {
359+
arr[i] = i as f32;
360+
}
361+
// A multiplier of 4 is word-addressing
362+
#[rustfmt::skip]
363+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
364+
let r = _mm512_i64gather_ps(index, arr.as_ptr() as *const u8, 4);
365+
assert_eq_m256(r, _mm256_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.));
366+
}
367+
368+
#[simd_test(enable = "avx512f")]
369+
unsafe fn test_mm512_mask_i64gather_ps() {
370+
let mut arr = [0f32; 128];
371+
for i in 0..128 {
372+
arr[i] = i as f32;
373+
}
374+
let src = _mm256_set1_ps(2.);
375+
let mask = 0b10101010;
376+
#[rustfmt::skip]
377+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
378+
// A multiplier of 4 is word-addressing
379+
let r = _mm512_mask_i64gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4);
380+
assert_eq_m256(r, _mm256_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.));
381+
}
382+
355383
#[simd_test(enable = "avx512f")]
356384
unsafe fn test_mm512_i32gather_epi64() {
357385
let mut arr = [0i64; 128];
@@ -407,4 +435,32 @@ mod tests {
407435
let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8);
408436
assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112));
409437
}
438+
439+
#[simd_test(enable = "avx512f")]
440+
unsafe fn test_mm512_i64gather_epi32() {
441+
let mut arr = [0i64; 128];
442+
for i in 0..128i64 {
443+
arr[i as usize] = i;
444+
}
445+
// A multiplier of 8 is word-addressing
446+
#[rustfmt::skip]
447+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
448+
let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8);
449+
assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112));
450+
}
451+
452+
#[simd_test(enable = "avx512f")]
453+
unsafe fn test_mm512_mask_i64gather_epi32() {
454+
let mut arr = [0i64; 128];
455+
for i in 0..128i64 {
456+
arr[i as usize] = i;
457+
}
458+
let src = _mm256_set1_epi32(2);
459+
let mask = 0b10101010;
460+
#[rustfmt::skip]
461+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
462+
// A multiplier of 8 is word-addressing
463+
let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8);
464+
assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112));
465+
}
410466
}

0 commit comments

Comments
 (0)