@@ -3371,8 +3371,7 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
3371
3371
// / @param vals The vector to store.
3372
3372
// / @param Flags Specifies the alignment.
3373
3373
// /
3374
- template <typename T, int N,
3375
- typename Flags = overaligned_tag<detail::OperandSize::OWORD>>
3374
+ template <typename T, int N, typename Flags>
3376
3375
__ESIMD_API std::enable_if_t <is_simd_flag_type_v<Flags>>
3377
3376
slm_block_store (uint32_t offset, simd<T, N> vals, Flags) {
3378
3377
constexpr size_t Align = Flags::template alignment<simd<T, N>>;
@@ -3391,6 +3390,12 @@ slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
3391
3390
// / simd_mask<1> pred, props={});
3392
3391
// / void slm_block_store(uint32_t byte_offset, simd<T, N> vals, // (slm-bs-2)
3393
3392
// / props={});
3393
+ // / void slm_block_store(uint32_t byte_offset, // (slm-bs-3)
3394
+ // / simd_view<T, RegionTy> vals,
3395
+ // / simd_mask<1> pred, props={});
3396
+ // /
3397
+ // / void slm_block_store(uint32_t byte_offset, // (slm-bs-4)
3398
+ // / simd_view<T, RegionTy> vals, props={});
3394
3399
// /
3395
3400
// / The following functions do the same work as slm_block_store(). They accept
3396
3401
// / a local accessor \p lacc and the store of \p vals is done to SLM associated
@@ -3403,11 +3408,26 @@ slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
3403
3408
// / props={});
3404
3409
// /
3405
3410
// / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-3)
3406
- // / simd<T, N> vals, simd_mask<1> pred, props={});
3411
+ // / simd<T, N> vals,
3412
+ // / simd_mask<1> pred, props={});
3407
3413
// /
3408
3414
// / void block_store(local_accessor lacc, simd<T, N> vals, // (lacc-bs-4)
3409
3415
// / simd_mask<1> pred, props={});
3410
-
3416
+ // /
3417
+ // / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-5)
3418
+ // / simd_view<T, RegionTy> vals, props={});
3419
+ // /
3420
+ // / void block_store(local_accessor lacc, // (lacc-bs-6)
3421
+ // / simd_view<T, RegionTy> vals, props={});
3422
+ // /
3423
+ // / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-7)
3424
+ // / simd_view<T, RegionTy> vals,
3425
+ // / simd_mask<1> pred, props={});
3426
+ // /
3427
+ // / void block_store(local_accessor lacc, // (lacc-bs-8)
3428
+ // / simd_view<T, RegionTy> vals,
3429
+ // / simd_mask<1> pred, props={});
3430
+ // /
3411
3431
// / void slm_block_store(uint32_t byte_offset, simd<T, N> vals, // (slm-bs-1)
3412
3432
// / simd_mask<1> pred, props={});
3413
3433
// / Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
@@ -3516,8 +3536,71 @@ slm_block_store(uint32_t byte_offset, simd<T, N> vals,
3516
3536
constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
3517
3537
constexpr size_t Alignment =
3518
3538
detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
3519
- __esimd_slm_block_st<detail::__raw_t <T>, N, Alignment>(byte_offset,
3520
- vals.data ());
3539
+ using StoreElemT = detail::__raw_t <T>;
3540
+ __esimd_slm_block_st<StoreElemT, N, Alignment>(
3541
+ byte_offset,
3542
+ sycl::bit_cast<__ESIMD_DNS::vector_type_t <StoreElemT, N>>(vals.data ()));
3543
+ }
3544
+
3545
+ // / void slm_block_store(uint32_t byte_offset, // (slm-bs-3)
3546
+ // / simd_view<T, RegionTy> vals,
3547
+ // / simd_mask<1> pred, props={});
3548
+ // / Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3549
+ // / Memory) at the given \p byte_offset. The parameter \p pred is the
3550
+ // / one-element predicate. If it is set to 1, then all 'N' elements are stored.
3551
+ // / Otherwise, the block stored operation is a NO-OP.
3552
+ // /
3553
+ // / The parameter 'props' specifies the optional compile-time properties
3554
+ // / list. Only esimd::alignment property is used. Other properties are ignored.
3555
+ // /
3556
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3557
+ // / the default expected alignment is the minimally required (see (R1) below).
3558
+ // /
3559
+ // / Restrictions - predicate imposed - temporary:
3560
+ // / R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3561
+ // / elements and 8-byte aligned for 8-byte elements.
3562
+ // / R2: The number of elements must be:
3563
+ // / for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3564
+ // / for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3565
+ // / or 128(only if alignment is 8-bytes or more);
3566
+ // / for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3567
+ // / or 256(only if alignment is 8-bytes or more);
3568
+ // / for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3569
+ // / or 512(only if alignment is 8-bytes or more).
3570
+ // / R3: The target device must be DG2, PVC or newer GPU.
3571
+ template <typename T, int N, typename RegionTy = region1d_t <T, N, 1 >,
3572
+ typename PropertyListT =
3573
+ ext::oneapi::experimental::detail::empty_properties_t >
3574
+ __ESIMD_API std::enable_if_t <
3575
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3576
+ slm_block_store (uint32_t byte_offset, simd_view<T, RegionTy> vals,
3577
+ simd_mask<1 > pred, PropertyListT props = {}) {
3578
+ slm_block_store<T, N>(byte_offset, vals.read (), pred, props);
3579
+ }
3580
+
3581
+ // / void slm_block_store(uint32_t byte_offset, // (slm-bs-4)
3582
+ // / simd_view<T, RegionTy> vals, props = {});
3583
+ // / Stores the vector \p vals to a contiguous memory block in SLM
3584
+ // / (Shared Local Memory) at the given \p byte_offset. The parameter 'props'
3585
+ // / specifies the optional compile-time properties list. Only esimd::alignment
3586
+ // / property is used. Other properties are ignored.
3587
+ // /
3588
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3589
+ // / the default expected alignment is 16-bytes to generate block_store
3590
+ // / instruction on all known target devices (Gen12, DG2, PVC, etc).
3591
+ // / On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3592
+ // / is valid, but requires JIT compiler generating a slower SCATTER instead
3593
+ // / of faster BLOCK_STORE.
3594
+ // / !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3595
+ // / the actual alignment in \p props produces incorrect store results on Gen12.
3596
+ template <typename T, int N, typename RegionTy = region1d_t <T, N, 1 >,
3597
+ typename PropertyListT =
3598
+ ext::oneapi::experimental::detail::empty_properties_t >
3599
+ __ESIMD_API std::enable_if_t <
3600
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3601
+ slm_block_store (uint32_t byte_offset, simd_view<T, RegionTy> vals,
3602
+ PropertyListT props = {}) {
3603
+ slm_block_store<T, N>(byte_offset, vals.read (), props);
3521
3604
}
3522
3605
3523
3606
// / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-1)
@@ -3652,6 +3735,141 @@ block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
3652
3735
slm_block_store<T, N>(detail::localAccessorToOffset (lacc), vals, pred, props);
3653
3736
}
3654
3737
3738
+ // / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-5)
3739
+ // / simd_view<T, RegionTy> vals, props={});
3740
+ // / Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3741
+ // / Memory) associated with the local accessor \p lacc at the given \p
3742
+ // / byte_offset. The parameter 'props' specifies the optional compile-time
3743
+ // / properties list. Only esimd::alignment property is used. Other properties
3744
+ // / are ignored.
3745
+ // /
3746
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3747
+ // / the default expected alignment is 16-bytes to generate block_store
3748
+ // / instruction on all known target devices (Gen12, DG2, PVC, etc).
3749
+ // / On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3750
+ // / is valid, but requires JIT compiler generating a slower SCATTER instead
3751
+ // / of faster BLOCK_STORE.
3752
+ // / !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3753
+ // / the actual alignment in \p props produces incorrect store results on Gen12.
3754
+ template <typename T, int N, typename AccessorT,
3755
+ typename RegionTy = region1d_t <T, N, 1 >,
3756
+ typename PropertyListT =
3757
+ ext::oneapi::experimental::detail::empty_properties_t >
3758
+ __ESIMD_API std::enable_if_t <
3759
+ detail::is_local_accessor_with_v<AccessorT,
3760
+ detail::accessor_mode_cap::can_write> &&
3761
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3762
+ block_store (AccessorT lacc, uint32_t byte_offset, simd_view<T, RegionTy> vals,
3763
+ PropertyListT props = {}) {
3764
+ block_store<T, N>(lacc, byte_offset, vals.read (), props);
3765
+ }
3766
+
3767
+ // / void block_store(local_accessor lacc, // (lacc-bs-6)
3768
+ // / simd_view<T, RegionTy> vals, props={});
3769
+ // / Stores the vector \p vals to a contiguous memory block in SLM
3770
+ // / (Shared Local Memory) associated with the local accessor \p lacc. The
3771
+ // / parameter 'props' specifies the optional compile-time properties list. Only
3772
+ // / esimd::alignment property is used. Other properties are ignored.
3773
+ // /
3774
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3775
+ // / the default expected alignment is 16-bytes to generate block_store
3776
+ // / instruction on all known target devices (Gen12, DG2, PVC, etc).
3777
+ // / On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3778
+ // / is valid, but requires JIT compiler generating a slower SCATTER instead
3779
+ // / of faster BLOCK_STORE.
3780
+ // / !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3781
+ // / the actual alignment in \p props produces incorrect store results on Gen12.
3782
+ template <typename T, int N, typename AccessorT,
3783
+ typename RegionTy = region1d_t <T, N, 1 >,
3784
+ typename PropertyListT =
3785
+ ext::oneapi::experimental::detail::empty_properties_t >
3786
+ __ESIMD_API std::enable_if_t <
3787
+ detail::is_local_accessor_with_v<AccessorT,
3788
+ detail::accessor_mode_cap::can_write> &&
3789
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3790
+ block_store (AccessorT lacc, simd_view<T, RegionTy> vals,
3791
+ PropertyListT props = {}) {
3792
+ block_store<T, N>(lacc, vals.read (), props);
3793
+ }
3794
+
3795
+ // / void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-7)
3796
+ // / simd_view<T, RegionTy> vals,
3797
+ // / simd_mask<1> pred, props={});
3798
+ // /
3799
+ // / Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3800
+ // / Memory) associated with the local accessor \p lacc at the given \p
3801
+ // / byte_offset. The parameter \p pred is the one-element predicate. If it is
3802
+ // / set to 1, then all 'N' elements are stored. Otherwise, the block store
3803
+ // / operation is a NO-OP.
3804
+ // /
3805
+ // / The parameter 'props' specifies the optional compile-time properties
3806
+ // / list. Only esimd::alignment property is used. Other properties are ignored.
3807
+ // /
3808
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3809
+ // / the default expected alignment is the minimally required (see (R1) below).
3810
+ // /
3811
+ // / Restrictions - predicate imposed - temporary:
3812
+ // / R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3813
+ // / elements and 8-byte aligned for 8-byte elements.
3814
+ // / R2: The number of elements must be:
3815
+ // / for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3816
+ // / for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3817
+ // / or 128(only if alignment is 8-bytes or more);
3818
+ // / for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3819
+ // / or 256(only if alignment is 8-bytes or more);
3820
+ // / for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3821
+ // / or 512(only if alignment is 8-bytes or more).
3822
+ template <typename T, int N, typename AccessorT,
3823
+ typename RegionTy = region1d_t <T, N, 1 >,
3824
+ typename PropertyListT =
3825
+ ext::oneapi::experimental::detail::empty_properties_t >
3826
+ __ESIMD_API std::enable_if_t <
3827
+ detail::is_local_accessor_with_v<AccessorT,
3828
+ detail::accessor_mode_cap::can_write> &&
3829
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3830
+ block_store (AccessorT lacc, uint32_t byte_offset, simd_view<T, RegionTy> vals,
3831
+ simd_mask<1 > pred, PropertyListT props = {}) {
3832
+ block_store<T, N>(lacc, byte_offset, vals.read (), pred, props);
3833
+ }
3834
+
3835
+ // / void block_store(local_accessor lacc, // (lacc-bs-8)
3836
+ // / simd_view<T, RegionTy> vals,
3837
+ // / simd_mask<1> pred, props={});
3838
+ // / Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3839
+ // / Memory) associated with the local accessor \p lacc. The parameter \p pred is
3840
+ // / the one-element predicate. If it is set to 1, then all 'N' elements are
3841
+ // / stored. Otherwise, the block store operation is a NO-OP.
3842
+ // /
3843
+ // / The parameter 'props' specifies the optional compile-time properties
3844
+ // / list. Only esimd::alignment property is used. Other properties are ignored.
3845
+ // /
3846
+ // / Alignment: If \p props does not specify the 'alignment' property, then
3847
+ // / the default expected alignment is the minimally required (see (R1) below).
3848
+ // /
3849
+ // / Restrictions - predicate imposed - temporary:
3850
+ // / R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3851
+ // / elements and 8-byte aligned for 8-byte elements.
3852
+ // / R2: The number of elements must be:
3853
+ // / for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3854
+ // / for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3855
+ // / or 128(only if alignment is 8-bytes or more);
3856
+ // / for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3857
+ // / or 256(only if alignment is 8-bytes or more);
3858
+ // / for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3859
+ // / or 512(only if alignment is 8-bytes or more).
3860
+ // / R3: The target device must be DG2, PVC or newer GPU.
3861
+ template <typename T, int N, typename AccessorT,
3862
+ typename RegionTy = region1d_t <T, N, 1 >,
3863
+ typename PropertyListT =
3864
+ ext::oneapi::experimental::detail::empty_properties_t >
3865
+ __ESIMD_API std::enable_if_t <
3866
+ detail::is_local_accessor_with_v<AccessorT,
3867
+ detail::accessor_mode_cap::can_write> &&
3868
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3869
+ block_store (AccessorT lacc, simd_view<T, RegionTy> vals, simd_mask<1 > pred,
3870
+ PropertyListT props = {}) {
3871
+ block_store<T, N>(lacc, vals.read (), pred, props);
3872
+ }
3655
3873
namespace detail {
3656
3874
3657
3875
// lsc_atomic_update() operations may share atomic_op values for data types
@@ -6486,8 +6704,7 @@ __ESIMD_API
6486
6704
// / @param vals The vector to store.
6487
6705
// / @param Flags Specifies the alignment.
6488
6706
// /
6489
- template <typename Tx, int N, typename AccessorTy,
6490
- typename Flags = overaligned_tag<detail::OperandSize::OWORD>>
6707
+ template <typename Tx, int N, typename AccessorTy, typename Flags>
6491
6708
__ESIMD_API
6492
6709
std::enable_if_t <detail::is_local_accessor_with_v<
6493
6710
AccessorTy, detail::accessor_mode_cap::can_write> &&
0 commit comments