Add pd gather intrinsics

Daniel Smith · Daniel Smith · commit 0d3a19befad3 · 2020-05-31T17:28:39.000Z
diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
@@ -205,3 +205,7 @@ simd_ty!(i64x8[i64]:
 simd_ty!(u64x8[u64]:
          u64, u64, u64, u64, u64, u64, u64, u64
          | x0, x1, x2, x3, x4, x5, x6, x7);
+
+simd_ty!(f64x8[f64]:
+         f64, f64, f64, f64, f64, f64, f64, f64
+         | x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
@@ -48,6 +48,17 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
     transmute(simd_select_bitmask(k, abs, zero))
 }
 
+/// Returns vector of type `__m512d` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_pd() -> __m512d {
+    // All-0 is a properly initialized __m512d
+    mem::zeroed()
+}
+
 /// Returns vector of type `__m512i` with all elements set to zero.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512)
@@ -87,6 +98,51 @@ pub unsafe fn _mm512_setr_epi32(
     transmute(r)
 }
 
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d {
+    let zero = _mm512_setzero_pd().as_f64x8();
+    let neg_one = -1;
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    macro_rules! call {
+        ($imm8:expr) => {
+            vgatherdpd(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    let r = constify_imm8!(scale, call);
+    transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+pub unsafe fn _mm512_mask_i32gather_pd(
+    src: __m512d,
+    mask: __mmask8,
+    offsets: __m256i,
+    slice: *const u8,
+    scale: i32,
+) -> __m512d {
+    let src = src.as_f64x8();
+    let slice = slice as *const i8;
+    let offsets = offsets.as_i32x8();
+    macro_rules! call {
+        ($imm8:expr) => {
+            vgatherdpd(src, slice, offsets, mask as i8, $imm8)
+        };
+    }
+    let r = constify_imm8!(scale, call);
+    transmute(r)
+}
+
 /// Gather 64-bit integers from memory using 32-bit indices.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64)
@@ -135,10 +191,19 @@ pub unsafe fn _mm512_mask_i32gather_epi64(
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.x86.avx512.gather.dpd.512"]
+    fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
     #[link_name = "llvm.x86.avx512.gather.dpq.512"]
     fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
 }
 
+/// Broadcast 64-bit float `a` to all elements of `dst`.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
+    transmute(f64x8::splat(a))
+}
+
 /// Broadcast 64-bit integer `a` to all elements of `dst`.
 #[inline]
 #[target_feature(enable = "avx512f")]
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
@@ -532,6 +532,24 @@ impl m512iExt for __m512i {
     }
 }
 
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "none")]
+pub(crate) trait m512dExt: Sized {
+    fn as_m512d(self) -> __m512d;
+
+    #[inline]
+    fn as_f64x8(self) -> crate::core_arch::simd::f64x8 {
+        unsafe { transmute(self.as_m512d()) }
+    }
+}
+
+impl m512dExt for __m512d {
+    #[inline]
+    fn as_m512d(self) -> Self {
+        self
+    }
+}
+
 mod eflags;
 pub use self::eflags::*;
 
diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs
@@ -143,3 +143,12 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
     }
     assert_eq!(A { a }.b, A { a: b }.b)
 }
+
+pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
+    // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC.
+    union A {
+        a: __m512d,
+        b: [f64; 8],
+    }
+    assert_eq!(A { a }.b, A { a: b }.b)
+}
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
@@ -3,6 +3,44 @@ use crate::{
     mem::transmute,
 };
 
+/// Sets packed 64-bit integers in `dst` with the supplied values.
+///
+/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_pd(
+    e0: f64,
+    e1: f64,
+    e2: f64,
+    e3: f64,
+    e4: f64,
+    e5: f64,
+    e6: f64,
+    e7: f64,
+) -> __m512d {
+    _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
+}
+
+/// Sets packed 64-bit integers in `dst` with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_pd(
+    e0: f64,
+    e1: f64,
+    e2: f64,
+    e3: f64,
+    e4: f64,
+    e5: f64,
+    e6: f64,
+    e7: f64,
+) -> __m512d {
+    let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+    transmute(r)
+}
+
 /// Sets packed 64-bit integers in `dst` with the supplied values.
 ///
 /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64)
@@ -49,6 +87,17 @@ mod tests {
     use crate::core_arch::x86::*;
     use crate::core_arch::x86_64::*;
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_setzero_pd() {
+        assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.));
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_set1_pd() {
+        let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.);
+        assert_eq_m512d(expected, _mm512_set1_pd(2.));
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_cmplt_epu64_mask() {
         let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
@@ -223,6 +272,18 @@ mod tests {
         assert_eq!(r, 0b01001010);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_set_pd() {
+        let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.));
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_setr_pd() {
+        let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.));
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_set_epi64() {
         let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
@@ -235,6 +296,34 @@ mod tests {
         assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0))
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_i32gather_pd() {
+        let mut arr = [0f64; 128];
+        for i in 0..128 {
+            arr[i] = i as f64;
+        }
+        // A multiplier of 8 is word-addressing
+        #[rustfmt::skip]
+        let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+        let r = _mm512_i32gather_pd(index, arr.as_ptr() as *const u8, 8);
+        assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.));
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_i32gather_pd() {
+        let mut arr = [0f64; 128];
+        for i in 0..128 {
+            arr[i] = i as f64;
+        }
+        let src = _mm512_set1_pd(2.);
+        let mask = 0b10101010;
+        #[rustfmt::skip]
+        let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+        // A multiplier of 8 is word-addressing
+        let r = _mm512_mask_i32gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8);
+        assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_i32gather_epi64() {
         let mut arr = [0i64; 128];
diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs
@@ -74,6 +74,7 @@ features! {
     /// * `"avx512bitalg"`
     /// * `"avx512bf16"`
     /// * `"avx512vp2intersect"`
+    /// * `"knc"`
     /// * `"f16c"`
     /// * `"fma"`
     /// * `"bmi1"`

Original file line number	Diff line number	Diff line change
`@@ -143,3 +143,12 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {`
`143`	`143`	`}`
`144`	`144`	`assert_eq!(A { a }.b, A { a: b }.b)`
`145`	`145`	`}`
	`146`	`+`
	`147`	`+pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {`
	`148`	+ // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC.
	`149`	`+ union A {`
	`150`	`+ a: __m512d,`
	`151`	`+ b: [f64; 8],`
	`152`	`+ }`
	`153`	`+ assert_eq!(A { a }.b, A { a: b }.b)`
	`154`	`+}`