
Add AVX 512f gather instructions #862


Closed · wants to merge 30 commits

Changes from 13 commits

Commits (30)
37a37e2
Add 64 bit AVX512f le and ge comparisons
May 30, 2020
3f88738
Checkpointing first gather implementation
May 30, 2020
cf3e316
Fix interface to be consistent
May 30, 2020
72959dd
Merge remote-tracking branch 'upstream/master' into avx-512-cmp
May 31, 2020
01102d7
Fix instruction assert
May 31, 2020
79dee01
Add _mm512_mask_i32gather_epi64
May 31, 2020
0d3a19b
Add pd gather intrinsics
May 31, 2020
f244d2e
Add 64 bit index variants
May 31, 2020
9b90883
Add 32 bit output gather intrinsics
May 31, 2020
0238065
Fix comments
May 31, 2020
d7e2afa
Fix comparison comments
May 31, 2020
dcf5d47
s/unsigned/signed/ for epi64
May 31, 2020
d9d0fc9
Add neq integer comparisons
May 31, 2020
9a1200d
Remove feature that wasn't added
May 31, 2020
ed9bbe4
Merge branch 'master' into moar-avx512f-cmp
May 31, 2020
f70f643
Constanting the arguments
Jun 6, 2020
e29e2ba
Merge branch 'avx-512-cmp' of github.com:Daniel-B-Smith/stdarch into …
Jun 6, 2020
c5cec2d
Fix comment
Jun 6, 2020
f775ef1
Make instruction check less specific for CI
Jun 6, 2020
2957e2e
Add comparison operator integer comparisons
Jun 6, 2020
7538c0f
Fix comments
Jun 6, 2020
33a4dd5
Allow non camel case types
Jun 6, 2020
a74886b
Add cmplt_ep(i|u)32
Jun 7, 2020
e8cfdb8
Allow AVX512f or KNC intrinsics to be gated by avx512f
Jun 13, 2020
690a03c
Add remaining 32bit integer comparisons
Jun 13, 2020
45aa0bd
Merge branch 'moar-avx512f-cmp' into avx-512-cmp
Jun 13, 2020
475c51d
Merge remote-tracking branch 'upstream/master' into moar-avx512f-cmp
Jun 13, 2020
832166a
Fix verify test with updated XML
Jun 13, 2020
1c81797
Merge branch 'moar-avx512f-cmp' into avx-512-cmp
Jun 13, 2020
c761d6f
Add remaining gather intrinsics
Jun 13, 2020
4 changes: 4 additions & 0 deletions crates/core_arch/src/simd.rs
@@ -205,3 +205,7 @@ simd_ty!(i64x8[i64]:
simd_ty!(u64x8[u64]:
u64, u64, u64, u64, u64, u64, u64, u64
| x0, x1, x2, x3, x4, x5, x6, x7);

simd_ty!(f64x8[f64]:
f64, f64, f64, f64, f64, f64, f64, f64
| x0, x1, x2, x3, x4, x5, x6, x7);
319 changes: 319 additions & 0 deletions crates/core_arch/src/x86/avx512f.rs
@@ -48,6 +48,17 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
transmute(simd_select_bitmask(k, abs, zero))
}

/// Returns vector of type `__m512d` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero_pd() -> __m512d {
// All-0 is a properly initialized __m512d
mem::zeroed()
}

/// Returns vector of type `__m512i` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512)
@@ -87,6 +98,314 @@ pub unsafe fn _mm512_setr_epi32(
transmute(r)
}

/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d {
let zero = _mm512_setzero_pd().as_f64x8();
let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i32x8();
macro_rules! call {
($imm8:expr) => {
vgatherdpd(zero, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}
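For orientation, a hypothetical call site might look like the sketch below. The wrapper name and the assumption that `data` holds at least eight elements are illustrative, not part of this PR; `scale = 8` matches the 8-byte stride of consecutive `f64` elements.

```rust
// Sketch only: gather the first eight f64 values from `data`.
#[target_feature(enable = "avx512f")]
unsafe fn gather_first_eight(data: &[f64]) -> __m512d {
    // Eight 32-bit indices, 0 through 7, packed into a __m256i.
    let offsets = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    // Address of lane i is data + offsets[i] * 8 bytes.
    _mm512_i32gather_pd(offsets, data.as_ptr() as *const u8, 8)
}
```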

/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i32gather_pd(
src: __m512d,
mask: __mmask8,
offsets: __m256i,
slice: *const u8,
scale: i32,
) -> __m512d {
let src = src.as_f64x8();
let slice = slice as *const i8;
let offsets = offsets.as_i32x8();
macro_rules! call {
($imm8:expr) => {
vgatherdpd(src, slice, offsets, mask as i8, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}
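A sketch of the mask semantics, under the same illustrative assumptions: lanes whose mask bit is 0 keep the corresponding lane of `src`, while set bits trigger a load from memory.

```rust
// Sketch only: gather even lanes, keep a sentinel in odd lanes.
#[target_feature(enable = "avx512f")]
unsafe fn gather_even_lanes(data: &[f64]) -> __m512d {
    let src = _mm512_set1_pd(-1.0); // fallback value for unselected lanes
    let offsets = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    // 0b0101_0101 selects lanes 0, 2, 4, 6; odd lanes stay -1.0.
    _mm512_mask_i32gather_pd(src, 0b0101_0101, offsets, data.as_ptr() as *const u8, 8)
}
```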

/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d {
let zero = _mm512_setzero_pd().as_f64x8();
let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vgatherqpd(zero, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i64gather_pd(
src: __m512d,
mask: __mmask8,
offsets: __m512i,
slice: *const u8,
scale: i32,
) -> __m512d {
let src = src.as_f64x8();
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vgatherqpd(src, slice, offsets, mask as i8, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 {
let zero = _mm256_setzero_ps().as_f32x8();
let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vgatherqps(zero, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i64gather_ps(
src: __m256,
mask: __mmask8,
offsets: __m512i,
slice: *const u8,
scale: i32,
) -> __m256 {
let src = src.as_f32x8();
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vgatherqps(src, slice, offsets, mask as i8, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 64-bit integers from memory using 32-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i {
let zero = _mm512_setzero_si512().as_i64x8();
Member: You should use _mm512_undefined here instead to match what Clang is doing.

Member: Hmm, actually it seems that Clang defines _mm512_undefined as zero-initialization, so it doesn't matter either way.

Contributor (author): Are you sure? I see it defined as a particular builtin, but _mm512_setzero is explicitly defined as zero initialization. I'm not sure of the behavior of __builtin_ia32_undef512, however.

https://github.com/llvm/llvm-project/blob/1b02db52b79e01f038775f59193a49850a34184d/clang/lib/Headers/avx512fintrin.h#L189
https://github.com/llvm/llvm-project/blob/a3dc9490004ce1601fb1bc67cf218b86a6fdf652/clang/include/clang/Basic/BuiltinsX86.def#L40
https://github.com/llvm/llvm-project/blob/1b02db52b79e01f038775f59193a49850a34184d/clang/lib/Headers/avx512fintrin.h#L259
https://github.com/llvm/llvm-project/blob/1b02db52b79e01f038775f59193a49850a34184d/clang/lib/Headers/avx512fintrin.h#L253

LLVM should be able to optimize away the dead store, but I'm happy to change the code regardless. I'm not quite sure how/if I can implement _mm512_undefined, since my reading of std::mem::MaybeUninit is that I couldn't create an uninitialized __m512i without inviting UB. Assuming the calling convention allows it, I should be able to create a MaybeUninit<__m512i> and pass that to vpgatherdq.
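A minimal sketch of the zero-initializing fallback discussed here; it mirrors Clang's observable behavior rather than producing a true undef, and the function name is illustrative, not part of this PR:

```rust
// Sketch only: a zero-initialized stand-in for _mm512_undefined, matching
// Clang's observable zero-initialization and sidestepping the MaybeUninit
// soundness question. LLVM may treat the redundant zero store as dead and
// elide it.
pub unsafe fn _mm512_undefined() -> __m512i {
    mem::zeroed()
}
```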

let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i32x8();
macro_rules! call {
($imm8:expr) => {
vpgatherdq(zero, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 64-bit integers from memory using 32-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i32gather_epi64(
src: __m512i,
mask: __mmask8,
offsets: __m256i,
slice: *const u8,
scale: i32,
) -> __m512i {
let src = src.as_i64x8();
let mask = mask as i8;
let slice = slice as *const i8;
let offsets = offsets.as_i32x8();
macro_rules! call {
($imm8:expr) => {
vpgatherdq(src, slice, offsets, mask, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 64-bit integers from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i {
let zero = _mm512_setzero_si512().as_i64x8();
let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vpgatherqq(zero, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 64-bit integers from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i64gather_epi64(
src: __m512i,
mask: __mmask8,
offsets: __m512i,
slice: *const u8,
scale: i32,
) -> __m512i {
let src = src.as_i64x8();
let mask = mask as i8;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vpgatherqq(src, slice, offsets, mask, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 32-bit integers from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i {
let zeros = _mm256_setzero_si256().as_i32x8();
let neg_one = -1;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vpgatherqd(zeros, slice, offsets, neg_one, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

/// Gather 32-bit integers from memory using 64-bit indices.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
#[rustc_args_required_const(4)]
pub unsafe fn _mm512_mask_i64gather_epi32(
src: __m256i,
mask: __mmask8,
offsets: __m512i,
slice: *const u8,
scale: i32,
) -> __m256i {
let src = src.as_i32x8();
let mask = mask as i8;
let slice = slice as *const i8;
let offsets = offsets.as_i64x8();
macro_rules! call {
($imm8:expr) => {
vpgatherqd(src, slice, offsets, mask, $imm8)
};
}
let r = constify_imm8_gather!(scale, call);
transmute(r)
}

#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.gather.dpd.512"]
fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
#[link_name = "llvm.x86.avx512.gather.qpd.512"]
fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
#[link_name = "llvm.x86.avx512.gather.qps.512"]
fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8;
#[link_name = "llvm.x86.avx512.gather.dpq.512"]
fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.gather.qpq.512"]
fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.gather.qpi.512"]
fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;
}

/// Broadcast 64-bit float `a` to all elements of `dst`.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
transmute(f64x8::splat(a))
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
#[inline]
#[target_feature(enable = "avx512f")]
16 changes: 16 additions & 0 deletions crates/core_arch/src/x86/macros.rs
@@ -92,6 +92,22 @@ macro_rules! constify_imm2 {
};
}

// For gather instructions, the only valid values for scale are 1, 2, 4 and 8.
// This macro enforces that.
#[allow(unused)]
macro_rules! constify_imm8_gather {
($imm8:expr, $expand:ident) => {
#[allow(overflowing_literals)]
match ($imm8) {
1 => $expand!(1),
2 => $expand!(2),
4 => $expand!(4),
8 => $expand!(8),
_ => panic!("Only 1, 2, 4, and 8 are valid values"),
}
};
}
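An illustrative call-site sketch of how the macro constrains `scale` (the wrapper is hypothetical): a literal 1, 2, 4, or 8 selects the matching expansion, and any other value falls through to the panic arm above.

```rust
// Sketch only: scale = 8 matches the 8-byte stride of i64 elements;
// passing, say, 3 would panic with "Only 1, 2, 4, and 8 are valid values".
#[target_feature(enable = "avx512f")]
unsafe fn gather_i64(base: &[i64]) -> __m512i {
    let idx = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    _mm512_i32gather_epi64(idx, base.as_ptr() as *const u8, 8)
}
```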

#[cfg(test)]
macro_rules! assert_approx_eq {
($a:expr, $b:expr, $eps:expr) => {{