Skip to content

Commit c75474b

Browse files
author
Daniel Smith
committed
Add AVX512f scatter intrinsics
1 parent c5cec2d commit c75474b

File tree

2 files changed

+490
-0
lines changed

2 files changed

+490
-0
lines changed

crates/core_arch/src/x86/avx512f.rs

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,279 @@ pub unsafe fn _mm512_mask_i64gather_epi32(
383383
transmute(r)
384384
}
385385

386+
/// Scatter double-precision (64-bit) floating-point elements from memory using 32-bit indices.
387+
///
388+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_pd)
389+
#[inline]
390+
#[target_feature(enable = "avx512f")]
391+
#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))]
392+
#[rustc_args_required_const(3)]
393+
pub unsafe fn _mm512_i32scatter_pd(slice: *mut u8, offsets: __m256i, src: __m512d, scale: i32) {
394+
let src = src.as_f64x8();
395+
let neg_one = -1;
396+
let slice = slice as *mut i8;
397+
let offsets = offsets.as_i32x8();
398+
macro_rules! call {
399+
($imm8:expr) => {
400+
vscatterdpd(slice, neg_one, offsets, src, $imm8)
401+
};
402+
}
403+
constify_imm8_gather!(scale, call);
404+
}
405+
406+
/// Scatter double-precision (64-bit) floating-point elements from src into memory using 32-bit indices.
407+
///
408+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_pd)
409+
#[inline]
410+
#[target_feature(enable = "avx512f")]
411+
#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))]
412+
#[rustc_args_required_const(4)]
413+
pub unsafe fn _mm512_mask_i32scatter_pd(
414+
slice: *mut u8,
415+
mask: __mmask8,
416+
offsets: __m256i,
417+
src: __m512d,
418+
scale: i32,
419+
) {
420+
let src = src.as_f64x8();
421+
let slice = slice as *mut i8;
422+
let offsets = offsets.as_i32x8();
423+
macro_rules! call {
424+
($imm8:expr) => {
425+
vscatterdpd(slice, mask as i8, offsets, src, $imm8)
426+
};
427+
}
428+
constify_imm8_gather!(scale, call);
429+
}
430+
431+
/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices.
432+
///
433+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd)
434+
#[inline]
435+
#[target_feature(enable = "avx512f")]
436+
#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))]
437+
#[rustc_args_required_const(3)]
438+
pub unsafe fn _mm512_i64scatter_pd(slice: *mut u8, offsets: __m512i, src: __m512d, scale: i32) {
439+
let src = src.as_f64x8();
440+
let neg_one = -1;
441+
let slice = slice as *mut i8;
442+
let offsets = offsets.as_i64x8();
443+
macro_rules! call {
444+
($imm8:expr) => {
445+
vscatterqpd(slice, neg_one, offsets, src, $imm8)
446+
};
447+
}
448+
constify_imm8_gather!(scale, call);
449+
}
450+
451+
/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices.
452+
///
453+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd)
454+
#[inline]
455+
#[target_feature(enable = "avx512f")]
456+
#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))]
457+
#[rustc_args_required_const(4)]
458+
pub unsafe fn _mm512_mask_i64scatter_pd(
459+
slice: *mut u8,
460+
mask: __mmask8,
461+
offsets: __m512i,
462+
src: __m512d,
463+
scale: i32,
464+
) {
465+
let src = src.as_f64x8();
466+
let slice = slice as *mut i8;
467+
let offsets = offsets.as_i64x8();
468+
macro_rules! call {
469+
($imm8:expr) => {
470+
vscatterqpd(slice, mask as i8, offsets, src, $imm8)
471+
};
472+
}
473+
constify_imm8_gather!(scale, call);
474+
}
475+
476+
/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices.
477+
///
478+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps)
479+
#[inline]
480+
#[target_feature(enable = "avx512f")]
481+
#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))]
482+
#[rustc_args_required_const(3)]
483+
pub unsafe fn _mm512_i64scatter_ps(slice: *mut u8, offsets: __m512i, src: __m256, scale: i32) {
484+
let src = src.as_f32x8();
485+
let neg_one = -1;
486+
let slice = slice as *mut i8;
487+
let offsets = offsets.as_i64x8();
488+
macro_rules! call {
489+
($imm8:expr) => {
490+
vscatterqps(slice, neg_one, offsets, src, $imm8)
491+
};
492+
}
493+
constify_imm8_gather!(scale, call);
494+
}
495+
496+
/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices.
497+
///
498+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps)
499+
#[inline]
500+
#[target_feature(enable = "avx512f")]
501+
#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))]
502+
#[rustc_args_required_const(4)]
503+
pub unsafe fn _mm512_mask_i64scatter_ps(
504+
slice: *mut u8,
505+
mask: __mmask8,
506+
offsets: __m512i,
507+
src: __m256,
508+
scale: i32,
509+
) {
510+
let src = src.as_f32x8();
511+
let slice = slice as *mut i8;
512+
let offsets = offsets.as_i64x8();
513+
macro_rules! call {
514+
($imm8:expr) => {
515+
vscatterqps(slice, mask as i8, offsets, src, $imm8)
516+
};
517+
}
518+
constify_imm8_gather!(scale, call);
519+
}
520+
521+
/// Scatter 64-bit integers from src into memory using 32-bit indices.
522+
///
523+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi64)
524+
#[inline]
525+
#[target_feature(enable = "avx512f")]
526+
#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))]
527+
#[rustc_args_required_const(3)]
528+
pub unsafe fn _mm512_i32scatter_epi64(slice: *mut u8, offsets: __m256i, src: __m512i, scale: i32) {
529+
let src = src.as_i64x8();
530+
let neg_one = -1;
531+
let slice = slice as *mut i8;
532+
let offsets = offsets.as_i32x8();
533+
macro_rules! call {
534+
($imm8:expr) => {
535+
vpscatterdq(slice, neg_one, offsets, src, $imm8)
536+
};
537+
}
538+
constify_imm8_gather!(scale, call);
539+
}
540+
541+
/// Scatter 64-bit integers from src into memory using 32-bit indices.
542+
///
543+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi64)
544+
#[inline]
545+
#[target_feature(enable = "avx512f")]
546+
#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))]
547+
#[rustc_args_required_const(4)]
548+
pub unsafe fn _mm512_mask_i32scatter_epi64(
549+
slice: *mut u8,
550+
mask: __mmask8,
551+
offsets: __m256i,
552+
src: __m512i,
553+
scale: i32,
554+
) {
555+
let src = src.as_i64x8();
556+
let mask = mask as i8;
557+
let slice = slice as *mut i8;
558+
let offsets = offsets.as_i32x8();
559+
macro_rules! call {
560+
($imm8:expr) => {
561+
vpscatterdq(slice, mask, offsets, src, $imm8)
562+
};
563+
}
564+
constify_imm8_gather!(scale, call);
565+
}
566+
567+
/// Scatter 64-bit integers from src into memory using 64-bit indices.
568+
///
569+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64)
570+
#[inline]
571+
#[target_feature(enable = "avx512f")]
572+
#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))]
573+
#[rustc_args_required_const(3)]
574+
pub unsafe fn _mm512_i64scatter_epi64(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) {
575+
let src = src.as_i64x8();
576+
let neg_one = -1;
577+
let slice = slice as *mut i8;
578+
let offsets = offsets.as_i64x8();
579+
macro_rules! call {
580+
($imm8:expr) => {
581+
vpscatterqq(slice, neg_one, offsets, src, $imm8)
582+
};
583+
}
584+
constify_imm8_gather!(scale, call);
585+
}
586+
587+
/// Scatter 64-bit integers from src into memory using 64-bit indices.
588+
///
589+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64)
590+
#[inline]
591+
#[target_feature(enable = "avx512f")]
592+
#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))]
593+
#[rustc_args_required_const(4)]
594+
pub unsafe fn _mm512_mask_i64scatter_epi64(
595+
slice: *mut u8,
596+
mask: __mmask8,
597+
offsets: __m512i,
598+
src: __m512i,
599+
scale: i32,
600+
) {
601+
let src = src.as_i64x8();
602+
let mask = mask as i8;
603+
let slice = slice as *mut i8;
604+
let offsets = offsets.as_i64x8();
605+
macro_rules! call {
606+
($imm8:expr) => {
607+
vpscatterqq(slice, mask, offsets, src, $imm8)
608+
};
609+
}
610+
constify_imm8_gather!(scale, call);
611+
}
612+
613+
/// Scatter 32-bit integers from src into memory using 64-bit indices.
614+
///
615+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32)
616+
#[inline]
617+
#[target_feature(enable = "avx512f")]
618+
#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))]
619+
#[rustc_args_required_const(3)]
620+
pub unsafe fn _mm512_i64scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m256i, scale: i32) {
621+
let src = src.as_i32x8();
622+
let neg_one = -1;
623+
let slice = slice as *mut i8;
624+
let offsets = offsets.as_i64x8();
625+
macro_rules! call {
626+
($imm8:expr) => {
627+
vpscatterqd(slice, neg_one, offsets, src, $imm8)
628+
};
629+
}
630+
constify_imm8_gather!(scale, call);
631+
}
632+
633+
/// Scatter 32-bit integers from src into memory using 64-bit indices.
634+
///
635+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi32)
636+
#[inline]
637+
#[target_feature(enable = "avx512f")]
638+
#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))]
639+
#[rustc_args_required_const(4)]
640+
pub unsafe fn _mm512_mask_i64scatter_epi32(
641+
slice: *mut u8,
642+
mask: __mmask8,
643+
offsets: __m512i,
644+
src: __m256i,
645+
scale: i32,
646+
) {
647+
let src = src.as_i32x8();
648+
let mask = mask as i8;
649+
let slice = slice as *mut i8;
650+
let offsets = offsets.as_i64x8();
651+
macro_rules! call {
652+
($imm8:expr) => {
653+
vpscatterqd(slice, mask, offsets, src, $imm8)
654+
};
655+
}
656+
constify_imm8_gather!(scale, call);
657+
}
658+
386659
#[allow(improper_ctypes)]
387660
extern "C" {
388661
#[link_name = "llvm.x86.avx512.gather.dpd.512"]
@@ -397,6 +670,20 @@ extern "C" {
397670
fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
398671
#[link_name = "llvm.x86.avx512.gather.qpi.512"]
399672
fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;
673+
674+
#[link_name = "llvm.x86.avx512.scatter.dpd.512"]
675+
fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32);
676+
#[link_name = "llvm.x86.avx512.scatter.qpd.512"]
677+
fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32);
678+
#[link_name = "llvm.x86.avx512.scatter.qps.512"]
679+
fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
680+
#[link_name = "llvm.x86.avx512.scatter.dpq.512"]
681+
fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
682+
#[link_name = "llvm.x86.avx512.scatter.qpq.512"]
683+
fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32);
684+
#[link_name = "llvm.x86.avx512.scatter.qpi.512"]
685+
fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);
686+
400687
}
401688

402689
/// Broadcast 64-bit float `a` to all elements of `dst`.

0 commit comments

Comments
 (0)