Skip to content

Commit 5e241ee

Browse files
committed
address feedback
Signed-off-by: Sarnie, Nick <[email protected]>
1 parent da83991 commit 5e241ee

File tree

8 files changed

+379
-80
lines changed

8 files changed

+379
-80
lines changed

sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ __ESIMD_INTRIN void __esimd_slm_block_st(uint32_t offset,
157157
}
158158
#endif // __SYCL_DEVICE_ONLY__
159159

160-
/// SLM scatter.
160+
/// SLM block_store/scatter.
161161
/// Supported platforms: DG2, PVC
162162
///
163163
/// Scatters elements located to slm.

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 225 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3371,8 +3371,7 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
33713371
/// @param vals The vector to store.
33723372
/// @param Flags Specifies the alignment.
33733373
///
3374-
template <typename T, int N,
3375-
typename Flags = overaligned_tag<detail::OperandSize::OWORD>>
3374+
template <typename T, int N, typename Flags>
33763375
__ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
33773376
slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
33783377
constexpr size_t Align = Flags::template alignment<simd<T, N>>;
@@ -3391,6 +3390,12 @@ slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
33913390
/// simd_mask<1> pred, props={});
33923391
/// void slm_block_store(uint32_t byte_offset, simd<T, N> vals, // (slm-bs-2)
33933392
/// props={});
3393+
/// void slm_block_store(uint32_t byte_offset, // (slm-bs-3)
3394+
/// simd_view<T, RegionTy> vals,
3395+
/// simd_mask<1> pred, props={});
3396+
///
3397+
/// void slm_block_store(uint32_t byte_offset, // (slm-bs-4)
3398+
/// simd_view<T, RegionTy> vals, props={});
33943399
///
33953400
/// The following functions do the same work as slm_block_store(). They accept
33963401
/// a local accessor \p lacc and the store of \p vals is done to SLM associated
@@ -3403,11 +3408,26 @@ slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
34033408
/// props={});
34043409
///
34053410
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-3)
3406-
/// simd<T, N> vals, simd_mask<1> pred, props={});
3411+
/// simd<T, N> vals,
3412+
/// simd_mask<1> pred, props={});
34073413
///
34083414
/// void block_store(local_accessor lacc, simd<T, N> vals, // (lacc-bs-4)
34093415
/// simd_mask<1> pred, props={});
3410-
3416+
///
3417+
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-5)
3418+
/// simd_view<T, RegionTy> vals, props={});
3419+
///
3420+
/// void block_store(local_accessor lacc, // (lacc-bs-6)
3421+
/// simd_view<T, RegionTy> vals, props={});
3422+
///
3423+
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-7)
3424+
/// simd_view<T, RegionTy> vals,
3425+
/// simd_mask<1> pred, props={});
3426+
///
3427+
/// void block_store(local_accessor lacc, // (lacc-bs-8)
3428+
/// simd_view<T, RegionTy> vals,
3429+
/// simd_mask<1> pred, props={});
3430+
///
34113431
/// void slm_block_store(uint32_t byte_offset, simd<T, N> vals, // (slm-bs-1)
34123432
/// simd_mask<1> pred, props={});
34133433
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
@@ -3516,8 +3536,71 @@ slm_block_store(uint32_t byte_offset, simd<T, N> vals,
35163536
constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
35173537
constexpr size_t Alignment =
35183538
detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
3519-
__esimd_slm_block_st<detail::__raw_t<T>, N, Alignment>(byte_offset,
3520-
vals.data());
3539+
using StoreElemT = detail::__raw_t<T>;
3540+
__esimd_slm_block_st<StoreElemT, N, Alignment>(
3541+
byte_offset,
3542+
sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, N>>(vals.data()));
3543+
}
3544+
3545+
/// void slm_block_store(uint32_t byte_offset, // (slm-bs-3)
3546+
/// simd_view<T, RegionTy> vals,
3547+
/// simd_mask<1> pred, props={});
3548+
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3549+
/// Memory) at the given \p byte_offset. The parameter \p pred is the
3550+
/// one-element predicate. If it is set to 1, then all 'N' elements are stored.
3551+
/// Otherwise, the block stored operation is a NO-OP.
3552+
///
3553+
/// The parameter 'props' specifies the optional compile-time properties
3554+
/// list. Only esimd::alignment property is used. Other properties are ignored.
3555+
///
3556+
/// Alignment: If \p props does not specify the 'alignment' property, then
3557+
/// the default expected alignment is the minimally required (see (R1) below).
3558+
///
3559+
/// Restrictions - predicate imposed - temporary:
3560+
/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3561+
/// elements and 8-byte aligned for 8-byte elements.
3562+
/// R2: The number of elements must be:
3563+
/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3564+
/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3565+
/// or 128(only if alignment is 8-bytes or more);
3566+
/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3567+
/// or 256(only if alignment is 8-bytes or more);
3568+
/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3569+
/// or 512(only if alignment is 8-bytes or more).
3570+
/// R3: The target device must be DG2, PVC or newer GPU.
3571+
template <typename T, int N, typename RegionTy = region1d_t<T, N, 1>,
3572+
typename PropertyListT =
3573+
ext::oneapi::experimental::detail::empty_properties_t>
3574+
__ESIMD_API std::enable_if_t<
3575+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3576+
slm_block_store(uint32_t byte_offset, simd_view<T, RegionTy> vals,
3577+
simd_mask<1> pred, PropertyListT props = {}) {
3578+
slm_block_store<T, N>(byte_offset, vals.read(), pred, props);
3579+
}
3580+
3581+
/// void slm_block_store(uint32_t byte_offset, // (slm-bs-4)
3582+
/// simd_view<T, RegionTy> vals, props = {});
3583+
/// Stores the vector \p vals to a contiguous memory block in SLM
3584+
/// (Shared Local Memory) at the given \p byte_offset. The parameter 'props'
3585+
/// specifies the optional compile-time properties list. Only esimd::alignment
3586+
/// property is used. Other properties are ignored.
3587+
///
3588+
/// Alignment: If \p props does not specify the 'alignment' property, then
3589+
/// the default expected alignment is 16-bytes to generate block_store
3590+
/// instruction on all known target devices (Gen12, DG2, PVC, etc).
3591+
/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3592+
/// is valid, but requires JIT compiler generating a slower SCATTER instead
3593+
/// of faster BLOCK_STORE.
3594+
/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3595+
/// the actual alignment in \p props produces incorrect store results on Gen12.
3596+
template <typename T, int N, typename RegionTy = region1d_t<T, N, 1>,
3597+
typename PropertyListT =
3598+
ext::oneapi::experimental::detail::empty_properties_t>
3599+
__ESIMD_API std::enable_if_t<
3600+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3601+
slm_block_store(uint32_t byte_offset, simd_view<T, RegionTy> vals,
3602+
PropertyListT props = {}) {
3603+
slm_block_store<T, N>(byte_offset, vals.read(), props);
35213604
}
35223605

35233606
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-1)
@@ -3652,6 +3735,141 @@ block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
36523735
slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
36533736
}
36543737

3738+
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-5)
3739+
/// simd_view<T, RegionTy> vals, props={});
3740+
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3741+
/// Memory) associated with the local accessor \p lacc at the given \p
3742+
/// byte_offset. The parameter 'props' specifies the optional compile-time
3743+
/// properties list. Only esimd::alignment property is used. Other properties
3744+
/// are ignored.
3745+
///
3746+
/// Alignment: If \p props does not specify the 'alignment' property, then
3747+
/// the default expected alignment is 16-bytes to generate block_store
3748+
/// instruction on all known target devices (Gen12, DG2, PVC, etc).
3749+
/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3750+
/// is valid, but requires JIT compiler generating a slower SCATTER instead
3751+
/// of faster BLOCK_STORE.
3752+
/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3753+
/// the actual alignment in \p props produces incorrect store results on Gen12.
3754+
template <typename T, int N, typename AccessorT,
3755+
typename RegionTy = region1d_t<T, N, 1>,
3756+
typename PropertyListT =
3757+
ext::oneapi::experimental::detail::empty_properties_t>
3758+
__ESIMD_API std::enable_if_t<
3759+
detail::is_local_accessor_with_v<AccessorT,
3760+
detail::accessor_mode_cap::can_write> &&
3761+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3762+
block_store(AccessorT lacc, uint32_t byte_offset, simd_view<T, RegionTy> vals,
3763+
PropertyListT props = {}) {
3764+
block_store<T, N>(lacc, byte_offset, vals.read(), props);
3765+
}
3766+
3767+
/// void block_store(local_accessor lacc, // (lacc-bs-6)
3768+
/// simd_view<T, RegionTy> vals, props={});
3769+
/// Stores the vector \p vals to a contiguous memory block in SLM
3770+
/// (Shared Local Memory) associated with the local accessor \p lacc. The
3771+
/// parameter 'props' specifies the optional compile-time properties list. Only
3772+
/// esimd::alignment property is used. Other properties are ignored.
3773+
///
3774+
/// Alignment: If \p props does not specify the 'alignment' property, then
3775+
/// the default expected alignment is 16-bytes to generate block_store
3776+
/// instruction on all known target devices (Gen12, DG2, PVC, etc).
3777+
/// On Gen12 (opposing to DG2 and PVC) the alignment smaller than 8-bytes
3778+
/// is valid, but requires JIT compiler generating a slower SCATTER instead
3779+
/// of faster BLOCK_STORE.
3780+
/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
3781+
/// the actual alignment in \p props produces incorrect store results on Gen12.
3782+
template <typename T, int N, typename AccessorT,
3783+
typename RegionTy = region1d_t<T, N, 1>,
3784+
typename PropertyListT =
3785+
ext::oneapi::experimental::detail::empty_properties_t>
3786+
__ESIMD_API std::enable_if_t<
3787+
detail::is_local_accessor_with_v<AccessorT,
3788+
detail::accessor_mode_cap::can_write> &&
3789+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3790+
block_store(AccessorT lacc, simd_view<T, RegionTy> vals,
3791+
PropertyListT props = {}) {
3792+
block_store<T, N>(lacc, vals.read(), props);
3793+
}
3794+
3795+
/// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-7)
3796+
/// simd_view<T, RegionTy> vals,
3797+
/// simd_mask<1> pred, props={});
3798+
///
3799+
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3800+
/// Memory) associated with the local accessor \p lacc at the given \p
3801+
/// byte_offset. The parameter \p pred is the one-element predicate. If it is
3802+
/// set to 1, then all 'N' elements are stored. Otherwise, the block store
3803+
/// operation is a NO-OP.
3804+
///
3805+
/// The parameter 'props' specifies the optional compile-time properties
3806+
/// list. Only esimd::alignment property is used. Other properties are ignored.
3807+
///
3808+
/// Alignment: If \p props does not specify the 'alignment' property, then
3809+
/// the default expected alignment is the minimally required (see (R1) below).
3810+
///
3811+
/// Restrictions - predicate imposed - temporary:
3812+
/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3813+
/// elements and 8-byte aligned for 8-byte elements.
3814+
/// R2: The number of elements must be:
3815+
/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3816+
/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3817+
/// or 128(only if alignment is 8-bytes or more);
3818+
/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3819+
/// or 256(only if alignment is 8-bytes or more);
3820+
/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3821+
/// or 512(only if alignment is 8-bytes or more).
3822+
template <typename T, int N, typename AccessorT,
3823+
typename RegionTy = region1d_t<T, N, 1>,
3824+
typename PropertyListT =
3825+
ext::oneapi::experimental::detail::empty_properties_t>
3826+
__ESIMD_API std::enable_if_t<
3827+
detail::is_local_accessor_with_v<AccessorT,
3828+
detail::accessor_mode_cap::can_write> &&
3829+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3830+
block_store(AccessorT lacc, uint32_t byte_offset, simd_view<T, RegionTy> vals,
3831+
simd_mask<1> pred, PropertyListT props = {}) {
3832+
block_store<T, N>(lacc, byte_offset, vals.read(), pred, props);
3833+
}
3834+
3835+
/// void block_store(local_accessor lacc, // (lacc-bs-8)
3836+
/// simd_view<T, RegionTy> vals,
3837+
/// simd_mask<1> pred, props={});
3838+
/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
3839+
/// Memory) associated with the local accessor \p lacc. The parameter \p pred is
3840+
/// the one-element predicate. If it is set to 1, then all 'N' elements are
3841+
/// stored. Otherwise, the block store operation is a NO-OP.
3842+
///
3843+
/// The parameter 'props' specifies the optional compile-time properties
3844+
/// list. Only esimd::alignment property is used. Other properties are ignored.
3845+
///
3846+
/// Alignment: If \p props does not specify the 'alignment' property, then
3847+
/// the default expected alignment is the minimally required (see (R1) below).
3848+
///
3849+
/// Restrictions - predicate imposed - temporary:
3850+
/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
3851+
/// elements and 8-byte aligned for 8-byte elements.
3852+
/// R2: The number of elements must be:
3853+
/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
3854+
/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
3855+
/// or 128(only if alignment is 8-bytes or more);
3856+
/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
3857+
/// or 256(only if alignment is 8-bytes or more);
3858+
/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
3859+
/// or 512(only if alignment is 8-bytes or more).
3860+
/// R3: The target device must be DG2, PVC or newer GPU.
3861+
template <typename T, int N, typename AccessorT,
3862+
typename RegionTy = region1d_t<T, N, 1>,
3863+
typename PropertyListT =
3864+
ext::oneapi::experimental::detail::empty_properties_t>
3865+
__ESIMD_API std::enable_if_t<
3866+
detail::is_local_accessor_with_v<AccessorT,
3867+
detail::accessor_mode_cap::can_write> &&
3868+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3869+
block_store(AccessorT lacc, simd_view<T, RegionTy> vals, simd_mask<1> pred,
3870+
PropertyListT props = {}) {
3871+
block_store<T, N>(lacc, vals.read(), pred, props);
3872+
}
36553873
namespace detail {
36563874

36573875
// lsc_atomic_update() operations may share atomic_op values for data types
@@ -6486,8 +6704,7 @@ __ESIMD_API
64866704
/// @param vals The vector to store.
64876705
/// @param Flags Specifies the alignment.
64886706
///
6489-
template <typename Tx, int N, typename AccessorTy,
6490-
typename Flags = overaligned_tag<detail::OperandSize::OWORD>>
6707+
template <typename Tx, int N, typename AccessorTy, typename Flags>
64916708
__ESIMD_API
64926709
std::enable_if_t<detail::is_local_accessor_with_v<
64936710
AccessorTy, detail::accessor_mode_cap::can_write> &&

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,6 +1523,7 @@ __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
15231523
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size>
15241524
__ESIMD_API void lsc_slm_block_store(uint32_t offset,
15251525
__ESIMD_NS::simd<T, NElts> vals) {
1526+
// Make sure we generate an LSC block store
15261527
constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
15271528
__ESIMD_NS::properties Props{__ESIMD_NS::alignment<DefaultAlignment>};
15281529
__ESIMD_NS::simd_mask<1> pred = 1;

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -793,8 +793,8 @@ template <typename T, bool TestPVCFeatures> bool testSLM(queue Q) {
793793

794794
constexpr int I32Factor =
795795
std::max(static_cast<int>(sizeof(int) / sizeof(T)), 1);
796-
constexpr size_t ReqiredAlignment = sizeof(T) <= 4 ? 4 : 8;
797-
properties PVCProps{alignment<ReqiredAlignment>};
796+
constexpr size_t RequiredAlignment = sizeof(T) <= 4 ? 4 : 8;
797+
properties PVCProps{alignment<RequiredAlignment>};
798798

799799
// Test block_load() that is available on PVC:
800800
// 1, 2, 3, 4, 8, ... N elements (up to 512-bytes).

0 commit comments

Comments
 (0)