Skip to content

Fix instruction assertions on LLVM 6 #321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion coresimd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
simd_ffi, target_feature, cfg_target_feature, i128_type, asm,
integer_atomics, stmt_expr_attributes, core_intrinsics,
crate_in_paths)]
#![cfg_attr(test, feature(proc_macro, test, attr_literals))]
#![cfg_attr(test, feature(proc_macro, test, attr_literals, abi_vectorcall))]
#![cfg_attr(feature = "cargo-clippy",
allow(inline_always, too_many_arguments, cast_sign_loss,
cast_lossless, cast_possible_wrap,
Expand Down
2 changes: 0 additions & 2 deletions coresimd/src/x86/i586/avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1921,7 +1921,6 @@ pub unsafe fn _mm256_set_epi32(
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[cfg_attr(test, assert_instr(vinsertf128))]
pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
_mm256_setr_epi64x(d, c, b, a)
}
Expand Down Expand Up @@ -2001,7 +2000,6 @@ pub unsafe fn _mm256_setr_epi32(
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[cfg_attr(test, assert_instr(vinsertf128))]
pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
mem::transmute(i64x4::new(a, b, c, d))
}
Expand Down
18 changes: 9 additions & 9 deletions coresimd/src/x86/i586/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
/// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
let imm8 = (imm8 & 0xFF) as u8;
let a = a.as_i32x4();
Expand Down Expand Up @@ -258,7 +258,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
/// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
pub unsafe fn _mm256_blend_epi32(
a: __m256i, b: __m256i, imm8: i32
) -> __m256i {
Expand Down Expand Up @@ -1790,15 +1790,15 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
/// integers of `a`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermd))]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
mem::transmute(permd(a.as_u32x8(), b.as_u32x8()))
}

/// Permutes 64-bit integers from `a` using control mask `imm8`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermq, imm8 = 9))]
#[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
let imm8 = (imm8 & 0xFF) as u8;
let zero = _mm256_setzero_si256().as_i64x4();
Expand Down Expand Up @@ -2007,7 +2007,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
/// ```
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
// simd_shuffleX requires that its selector parameter be made up of
// constant values, but we can't enforce that here. In spirit, we need
Expand Down Expand Up @@ -2762,7 +2762,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
/// ```
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
let r: i32x8 = simd_shuffle8(
a.as_i32x8(),
Expand Down Expand Up @@ -2802,7 +2802,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
/// ```
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
let r: i32x8 =
simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
Expand Down Expand Up @@ -2839,7 +2839,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
/// ```
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
mem::transmute(r)
Expand Down Expand Up @@ -2875,7 +2875,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
/// ```
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
mem::transmute(r)
Expand Down
15 changes: 4 additions & 11 deletions coresimd/src/x86/i586/sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -832,8 +832,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
/// lower half of result.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
#[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
#[cfg_attr(test, assert_instr(movhlps))]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
// TODO: figure out why this is a different instruction on Windows.
simd_shuffle4(a, b, [6, 7, 2, 3])
Expand All @@ -843,8 +842,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
/// higher half of result.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(unpcklpd))]
#[cfg_attr(all(test, not(target_feature = "sse2")), assert_instr(movlhps))]
#[cfg_attr(test, assert_instr(movlhps))]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [0, 1, 4, 5])
}
Expand Down Expand Up @@ -900,7 +898,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
// 32-bit codegen does not generate `movhps` or `movhpd`, but instead
// `movsd` followed by `unpcklpd` (or `movss`/`unpcklps` if there's no SSE2).
#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
assert_instr(unpcklpd))]
assert_instr(movlhps))]
#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
assert_instr(unpcklps))]
// TODO: This function is actually not limited to floats, but that's what
Expand Down Expand Up @@ -1095,13 +1093,8 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
#[inline]
#[target_feature(enable = "sse")]
// On i586 the codegen just generates plain MOVs. No need to test for that.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
not(target_family = "windows")),
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
assert_instr(movlps))]
// Win64 passes `a` by reference, which causes it to generate two 64 bit moves.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
target_family = "windows"),
assert_instr(movsd))]
pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) {
#[cfg(target_arch = "x86")]
{
Expand Down
17 changes: 8 additions & 9 deletions coresimd/src/x86/i586/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i {
/// Return the lowest element of `a`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(windows)), assert_instr(movd))] // FIXME mov on windows
#[cfg_attr(test, assert_instr(movd))]
pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
simd_extract(a.as_i32x4(), 0)
}
Expand Down Expand Up @@ -1207,7 +1207,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhdq))]
#[cfg_attr(test, assert_instr(unpckhps))]
pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i32x4, _>(simd_shuffle4(
a.as_i32x4(),
Expand All @@ -1219,7 +1219,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhqdq))]
#[cfg_attr(test, assert_instr(unpckhpd))]
pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i64x2, _>(simd_shuffle2(
a.as_i64x2(),
Expand Down Expand Up @@ -1253,7 +1253,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckldq))]
#[cfg_attr(test, assert_instr(unpcklps))]
pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i32x4, _>(simd_shuffle4(
a.as_i32x4(),
Expand All @@ -1265,7 +1265,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklqdq))]
#[cfg_attr(test, assert_instr(movlhps))]
pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i64x2, _>(simd_shuffle2(
a.as_i64x2(),
Expand Down Expand Up @@ -1795,7 +1795,6 @@ pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
/// Return the lower double-precision (64-bit) floating-point element of "a".
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, windows), assert_instr(movsd))] // FIXME movq/movlps/mov on other platform
pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 {
simd_extract(a, 0)
}
Expand Down Expand Up @@ -1953,7 +1952,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
/// memory location.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movsd only on windows
#[cfg_attr(test, assert_instr(movlps))]
pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
*mem_addr = simd_extract(a, 0)
}
Expand Down Expand Up @@ -2022,7 +2021,7 @@ pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
/// memory location.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movlpd (movsd on windows)
#[cfg_attr(test, assert_instr(movlps))] // FIXME movlpd
pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
*mem_addr = simd_extract(a, 0);
}
Expand Down Expand Up @@ -2179,7 +2178,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
/// * The [63:0] bits are copied from the [63:0] bits of the first input
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklpd))]
#[cfg_attr(test, assert_instr(movlhps))]
pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [0, 2])
}
Expand Down
4 changes: 2 additions & 2 deletions coresimd/src/x86/i586/sse41.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
#[inline]
#[target_feature(enable = "sse4.1")]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))]
#[cfg_attr(test, assert_instr(extractps, imm8 = 0))]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11))
}
Expand All @@ -141,7 +141,7 @@ pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
#[inline]
#[target_feature(enable = "sse4.1")]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))]
#[cfg_attr(test, assert_instr(extractps, imm8 = 1))]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
let imm8 = (imm8 & 3) as u32;
simd_extract::<_, i32>(a.as_i32x4(), imm8)
Expand Down
2 changes: 1 addition & 1 deletion coresimd/src/x86/x86_64/sse41.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use stdsimd_test::assert_instr;
#[inline]
#[target_feature(enable = "sse4.1")]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))]
#[cfg_attr(test, assert_instr(pextrq, imm8 = 1))]
pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 {
let imm8 = (imm8 & 1) as u32;
simd_extract(a.as_i64x2(), imm8)
Expand Down
10 changes: 9 additions & 1 deletion stdsimd-test/assert-instr-macro/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,17 @@ pub fn assert_instr(
})
.collect::<Vec<_>>();
let attrs = Append(&attrs);

// Use an ABI on Windows that passes SIMD values in registers, like what
// happens on Unix (I think?) by default.
let abi = if cfg!(windows) {
syn::LitStr::new("vectorcall", proc_macro2::Span::call_site())
} else {
syn::LitStr::new("C", proc_macro2::Span::call_site())
};
let to_test = quote! {
#attrs
unsafe extern fn #shim_name(#(#inputs),*) #ret {
unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
#name(#(#input_vals),*)
}
};
Expand Down