Skip to content

Commit 0d3a19b

Browse files
author
Daniel Smith
committed
Add pd gather intrinsics
1 parent 79dee01 commit 0d3a19b

File tree

6 files changed

+186
-0
lines changed

6 files changed

+186
-0
lines changed

crates/core_arch/src/simd.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,7 @@ simd_ty!(i64x8[i64]:
205205
simd_ty!(u64x8[u64]:
206206
u64, u64, u64, u64, u64, u64, u64, u64
207207
| x0, x1, x2, x3, x4, x5, x6, x7);
208+
209+
simd_ty!(f64x8[f64]:
210+
f64, f64, f64, f64, f64, f64, f64, f64
211+
| x0, x1, x2, x3, x4, x5, x6, x7);

crates/core_arch/src/x86/avx512f.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,17 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
4848
transmute(simd_select_bitmask(k, abs, zero))
4949
}
5050

51+
/// Returns vector of type `__m512d` with all elements set to zero.
52+
///
53+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd)
54+
#[inline]
55+
#[target_feature(enable = "avx512f")]
56+
#[cfg_attr(test, assert_instr(vxorps))]
57+
pub unsafe fn _mm512_setzero_pd() -> __m512d {
58+
// All-0 is a properly initialized __m512d
59+
mem::zeroed()
60+
}
61+
5162
/// Returns vector of type `__m512i` with all elements set to zero.
5263
///
5364
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512)
@@ -87,6 +98,51 @@ pub unsafe fn _mm512_setr_epi32(
8798
transmute(r)
8899
}
89100

101+
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
102+
///
103+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd)
104+
#[inline]
105+
#[target_feature(enable = "avx512f")]
106+
#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
107+
pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d {
108+
let zero = _mm512_setzero_pd().as_f64x8();
109+
let neg_one = -1;
110+
let slice = slice as *const i8;
111+
let offsets = offsets.as_i32x8();
112+
macro_rules! call {
113+
($imm8:expr) => {
114+
vgatherdpd(zero, slice, offsets, neg_one, $imm8)
115+
};
116+
}
117+
let r = constify_imm8!(scale, call);
118+
transmute(r)
119+
}
120+
121+
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
122+
///
123+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd)
124+
#[inline]
125+
#[target_feature(enable = "avx512f")]
126+
#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
127+
pub unsafe fn _mm512_mask_i32gather_pd(
128+
src: __m512d,
129+
mask: __mmask8,
130+
offsets: __m256i,
131+
slice: *const u8,
132+
scale: i32,
133+
) -> __m512d {
134+
let src = src.as_f64x8();
135+
let slice = slice as *const i8;
136+
let offsets = offsets.as_i32x8();
137+
macro_rules! call {
138+
($imm8:expr) => {
139+
vgatherdpd(src, slice, offsets, mask as i8, $imm8)
140+
};
141+
}
142+
let r = constify_imm8!(scale, call);
143+
transmute(r)
144+
}
145+
90146
/// Gather 64-bit integers from memory using 32-bit indices.
91147
///
92148
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64)
@@ -135,10 +191,19 @@ pub unsafe fn _mm512_mask_i32gather_epi64(
135191

136192
#[allow(improper_ctypes)]
137193
extern "C" {
194+
#[link_name = "llvm.x86.avx512.gather.dpd.512"]
195+
fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
138196
#[link_name = "llvm.x86.avx512.gather.dpq.512"]
139197
fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
140198
}
141199

200+
/// Broadcast 64-bit float `a` to all elements of `dst`.
201+
#[inline]
202+
#[target_feature(enable = "avx512f")]
203+
pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
204+
transmute(f64x8::splat(a))
205+
}
206+
142207
/// Broadcast 64-bit integer `a` to all elements of `dst`.
143208
#[inline]
144209
#[target_feature(enable = "avx512f")]

crates/core_arch/src/x86/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,24 @@ impl m512iExt for __m512i {
532532
}
533533
}
534534

535+
#[allow(non_camel_case_types)]
536+
#[unstable(feature = "stdimd_internal", issue = "none")]
537+
pub(crate) trait m512dExt: Sized {
538+
fn as_m512d(self) -> __m512d;
539+
540+
#[inline]
541+
fn as_f64x8(self) -> crate::core_arch::simd::f64x8 {
542+
unsafe { transmute(self.as_m512d()) }
543+
}
544+
}
545+
546+
impl m512dExt for __m512d {
547+
#[inline]
548+
fn as_m512d(self) -> Self {
549+
self
550+
}
551+
}
552+
535553
mod eflags;
536554
pub use self::eflags::*;
537555

crates/core_arch/src/x86/test.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,12 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
143143
}
144144
assert_eq!(A { a }.b, A { a: b }.b)
145145
}
146+
147+
pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
148+
// TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC.
149+
union A {
150+
a: __m512d,
151+
b: [f64; 8],
152+
}
153+
assert_eq!(A { a }.b, A { a: b }.b)
154+
}

crates/core_arch/src/x86_64/avx512f.rs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,44 @@ use crate::{
33
mem::transmute,
44
};
55

6+
/// Sets packed 64-bit integers in `dst` with the supplied values.
7+
///
8+
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
9+
#[inline]
10+
#[target_feature(enable = "avx512f")]
11+
pub unsafe fn _mm512_set_pd(
12+
e0: f64,
13+
e1: f64,
14+
e2: f64,
15+
e3: f64,
16+
e4: f64,
17+
e5: f64,
18+
e6: f64,
19+
e7: f64,
20+
) -> __m512d {
21+
_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
22+
}
23+
24+
/// Sets packed 64-bit integers in `dst` with the supplied values in
25+
/// reverse order.
26+
///
27+
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
28+
#[inline]
29+
#[target_feature(enable = "avx512f")]
30+
pub unsafe fn _mm512_setr_pd(
31+
e0: f64,
32+
e1: f64,
33+
e2: f64,
34+
e3: f64,
35+
e4: f64,
36+
e5: f64,
37+
e6: f64,
38+
e7: f64,
39+
) -> __m512d {
40+
let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
41+
transmute(r)
42+
}
43+
644
/// Sets packed 64-bit integers in `dst` with the supplied values.
745
///
846
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64)
@@ -49,6 +87,17 @@ mod tests {
4987
use crate::core_arch::x86::*;
5088
use crate::core_arch::x86_64::*;
5189

90+
#[simd_test(enable = "avx512f")]
91+
unsafe fn test_mm512_setzero_pd() {
92+
assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.));
93+
}
94+
95+
#[simd_test(enable = "avx512f")]
96+
unsafe fn test_mm512_set1_pd() {
97+
let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.);
98+
assert_eq_m512d(expected, _mm512_set1_pd(2.));
99+
}
100+
52101
#[simd_test(enable = "avx512f")]
53102
unsafe fn test_mm512_cmplt_epu64_mask() {
54103
let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
@@ -223,6 +272,18 @@ mod tests {
223272
assert_eq!(r, 0b01001010);
224273
}
225274

275+
#[simd_test(enable = "avx512f")]
276+
unsafe fn test_mm512_set_pd() {
277+
let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
278+
assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.));
279+
}
280+
281+
#[simd_test(enable = "avx512f")]
282+
unsafe fn test_mm512_setr_pd() {
283+
let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
284+
assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.));
285+
}
286+
226287
#[simd_test(enable = "avx512f")]
227288
unsafe fn test_mm512_set_epi64() {
228289
let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
@@ -235,6 +296,34 @@ mod tests {
235296
assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0))
236297
}
237298

299+
#[simd_test(enable = "avx512f")]
300+
unsafe fn test_mm512_i32gather_pd() {
301+
let mut arr = [0f64; 128];
302+
for i in 0..128 {
303+
arr[i] = i as f64;
304+
}
305+
// A multiplier of 8 is word-addressing
306+
#[rustfmt::skip]
307+
let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
308+
let r = _mm512_i32gather_pd(index, arr.as_ptr() as *const u8, 8);
309+
assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.));
310+
}
311+
312+
#[simd_test(enable = "avx512f")]
313+
unsafe fn test_mm512_mask_i32gather_pd() {
314+
let mut arr = [0f64; 128];
315+
for i in 0..128 {
316+
arr[i] = i as f64;
317+
}
318+
let src = _mm512_set1_pd(2.);
319+
let mask = 0b10101010;
320+
#[rustfmt::skip]
321+
let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
322+
// A multiplier of 8 is word-addressing
323+
let r = _mm512_mask_i32gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8);
324+
assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
325+
}
326+
238327
#[simd_test(enable = "avx512f")]
239328
unsafe fn test_mm512_i32gather_epi64() {
240329
let mut arr = [0i64; 128];

crates/std_detect/src/detect/arch/x86.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ features! {
7474
/// * `"avx512bitalg"`
7575
/// * `"avx512bf16"`
7676
/// * `"avx512vp2intersect"`
77+
/// * `"knc"`
7778
/// * `"f16c"`
7879
/// * `"fma"`
7980
/// * `"bmi1"`

0 commit comments

Comments
 (0)