Skip to content

Commit f244d2e

Browse files
author
Daniel Smith
committed
Add 64 bit index variants
1 parent 0d3a19b commit f244d2e

File tree

2 files changed

+151
-0
lines changed

2 files changed

+151
-0
lines changed

crates/core_arch/src/x86/avx512f.rs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,51 @@ pub unsafe fn _mm512_mask_i32gather_pd(
143143
transmute(r)
144144
}
145145

146+
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices.
147+
///
148+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd)
149+
#[inline]
150+
#[target_feature(enable = "avx512f")]
151+
#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
152+
pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d {
153+
let zero = _mm512_setzero_pd().as_f64x8();
154+
let neg_one = -1;
155+
let slice = slice as *const i8;
156+
let offsets = offsets.as_i64x8();
157+
macro_rules! call {
158+
($imm8:expr) => {
159+
vgatherqpd(zero, slice, offsets, neg_one, $imm8)
160+
};
161+
}
162+
let r = constify_imm8!(scale, call);
163+
transmute(r)
164+
}
165+
166+
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices.
167+
///
168+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd)
169+
#[inline]
170+
#[target_feature(enable = "avx512f")]
171+
#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
172+
pub unsafe fn _mm512_mask_i64gather_pd(
173+
src: __m512d,
174+
mask: __mmask8,
175+
offsets: __m512i,
176+
slice: *const u8,
177+
scale: i32,
178+
) -> __m512d {
179+
let src = src.as_f64x8();
180+
let slice = slice as *const i8;
181+
let offsets = offsets.as_i64x8();
182+
macro_rules! call {
183+
($imm8:expr) => {
184+
vgatherqpd(src, slice, offsets, mask as i8, $imm8)
185+
};
186+
}
187+
let r = constify_imm8!(scale, call);
188+
transmute(r)
189+
}
190+
146191
/// Gather 64-bit integers from memory using 32-bit indices.
147192
///
148193
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64)
@@ -189,12 +234,62 @@ pub unsafe fn _mm512_mask_i32gather_epi64(
189234
transmute(r)
190235
}
191236

237+
/// Gather 64-bit integers from memory using 64-bit indices.
238+
///
239+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64)
240+
#[inline]
241+
#[target_feature(enable = "avx512f")]
242+
#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
243+
pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i {
244+
let zero = _mm512_setzero_si512().as_i64x8();
245+
let neg_one = -1;
246+
let slice = slice as *const i8;
247+
let offsets = offsets.as_i64x8();
248+
macro_rules! call {
249+
($imm8:expr) => {
250+
vpgatherqq(zero, slice, offsets, neg_one, $imm8)
251+
};
252+
}
253+
let r = constify_imm8!(scale, call);
254+
transmute(r)
255+
}
256+
257+
/// Gather 64-bit integers from memory using 64-bit indices.
258+
///
259+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64)
260+
#[inline]
261+
#[target_feature(enable = "avx512f")]
262+
#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
263+
pub unsafe fn _mm512_mask_i64gather_epi64(
264+
src: __m512i,
265+
mask: __mmask8,
266+
offsets: __m512i,
267+
slice: *const u8,
268+
scale: i32,
269+
) -> __m512i {
270+
let src = src.as_i64x8();
271+
let mask = mask as i8;
272+
let slice = slice as *const i8;
273+
let offsets = offsets.as_i64x8();
274+
macro_rules! call {
275+
($imm8:expr) => {
276+
vpgatherqq(src, slice, offsets, mask, $imm8)
277+
};
278+
}
279+
let r = constify_imm8!(scale, call);
280+
transmute(r)
281+
}
282+
192283
#[allow(improper_ctypes)]
193284
extern "C" {
194285
#[link_name = "llvm.x86.avx512.gather.dpd.512"]
195286
fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
287+
#[link_name = "llvm.x86.avx512.gather.qpd.512"]
288+
fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
196289
#[link_name = "llvm.x86.avx512.gather.dpq.512"]
197290
fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
291+
#[link_name = "llvm.x86.avx512.gather.qpq.512"]
292+
fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
198293
}
199294

200295
/// Broadcast 64-bit float `a` to all elements of `dst`.

crates/core_arch/src/x86_64/avx512f.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,34 @@ mod tests {
324324
assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
325325
}
326326

327+
#[simd_test(enable = "avx512f")]
328+
unsafe fn test_mm512_i64gather_pd() {
329+
let mut arr = [0f64; 128];
330+
for i in 0..128 {
331+
arr[i] = i as f64;
332+
}
333+
// A multiplier of 8 is word-addressing
334+
#[rustfmt::skip]
335+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
336+
let r = _mm512_i64gather_pd(index, arr.as_ptr() as *const u8, 8);
337+
assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.));
338+
}
339+
340+
#[simd_test(enable = "avx512f")]
341+
unsafe fn test_mm512_mask_i64gather_pd() {
342+
let mut arr = [0f64; 128];
343+
for i in 0..128 {
344+
arr[i] = i as f64;
345+
}
346+
let src = _mm512_set1_pd(2.);
347+
let mask = 0b10101010;
348+
#[rustfmt::skip]
349+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
350+
// A multiplier of 8 is word-addressing
351+
let r = _mm512_mask_i64gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8);
352+
assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
353+
}
354+
327355
#[simd_test(enable = "avx512f")]
328356
unsafe fn test_mm512_i32gather_epi64() {
329357
let mut arr = [0i64; 128];
@@ -351,4 +379,32 @@ mod tests {
351379
let r = _mm512_mask_i32gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8);
352380
assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112));
353381
}
382+
383+
#[simd_test(enable = "avx512f")]
384+
unsafe fn test_mm512_i64gather_epi64() {
385+
let mut arr = [0i64; 128];
386+
for i in 0..128i64 {
387+
arr[i as usize] = i;
388+
}
389+
// A multiplier of 8 is word-addressing
390+
#[rustfmt::skip]
391+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
392+
let r = _mm512_i64gather_epi64(index, arr.as_ptr() as *const u8, 8);
393+
assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112));
394+
}
395+
396+
#[simd_test(enable = "avx512f")]
397+
unsafe fn test_mm512_mask_i64gather_epi64() {
398+
let mut arr = [0i64; 128];
399+
for i in 0..128i64 {
400+
arr[i as usize] = i;
401+
}
402+
let src = _mm512_set1_epi64(2);
403+
let mask = 0b10101010;
404+
#[rustfmt::skip]
405+
let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
406+
// A multiplier of 8 is word-addressing
407+
let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8);
408+
assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112));
409+
}
354410
}

0 commit comments

Comments
 (0)