Skip to content

Commit 21ca00f

Browse files
authored
[SYCL][ESIMD] Support passing local accessor to API accepting accessor parameter - part1 (#9931)
1 parent f46a63f commit 21ca00f

File tree

5 files changed

+640
-25
lines changed

5 files changed

+640
-25
lines changed

sycl/include/sycl/ext/intel/esimd/detail/sycl_util.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,12 @@ template <typename T, accessor_mode_cap_val_t Capability,
7979
using EnableIfAccessor = std::enable_if_t<
8080
detail::is_sycl_accessor_with<T, Capability, AccessTarget>::value, RetT>;
8181

82+
template <typename T, int Dimensions>
83+
__ESIMD_API uint32_t localAccessorToOffset(local_accessor<T, Dimensions> acc) {
84+
return static_cast<uint32_t>(
85+
reinterpret_cast<std::uintptr_t>(acc.get_pointer()));
86+
}
87+
8288
} // namespace ext::intel::esimd::detail
8389
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
8490
} // namespace sycl

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 200 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
283283
/// 1, 2, 4 or 8 owords long.
284284
/// @tparam Flags The alignment specifier type tag. Auto-deduced from the
285285
/// \c Flags parameter. If it is less than \c 16, then slower unaligned
286-
/// access is generated, othewise the access is aligned.
286+
/// access is generated, otherwise the access is aligned.
287287
/// @param addr The address to load from.
288288
/// @param Flags Specifies the alignment.
289289
/// @return A vector of loaded elements.
@@ -320,15 +320,18 @@ __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
320320
/// @tparam AccessorTy Accessor type (auto-deduced).
321321
/// @tparam Flags The alignment specifier type tag. Auto-deduced from the
322322
/// \c Flags parameter. If it is less than \c 16, then slower unaligned
323-
/// access is generated, othewise the access is aligned.
323+
/// access is generated, otherwise the access is aligned.
324324
/// @param acc The accessor.
325325
/// @param offset The offset to load from in bytes.
326326
/// @param Flags Specifies the alignment.
327327
/// @return A vector of loaded elements.
328328
///
329329
template <typename Tx, int N, typename AccessorTy,
330330
typename Flags = vector_aligned_tag,
331-
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
331+
typename = std::enable_if_t<
332+
is_simd_flag_type_v<Flags> &&
333+
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
334+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>,
332335
class T = detail::__raw_t<Tx>>
333336
__ESIMD_API simd<Tx, N> block_load(AccessorTy acc,
334337
#ifdef __ESIMD_FORCE_STATELESS_MEM
@@ -399,13 +402,16 @@ __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
399402
///
400403
template <typename Tx, int N, typename AccessorTy,
401404
class T = detail::__raw_t<Tx>>
402-
__ESIMD_API void block_store(AccessorTy acc,
405+
__ESIMD_API std::enable_if_t<
406+
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
407+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
408+
block_store(AccessorTy acc,
403409
#ifdef __ESIMD_FORCE_STATELESS_MEM
404-
uint64_t offset,
410+
uint64_t offset,
405411
#else
406-
uint32_t offset,
412+
uint32_t offset,
407413
#endif
408-
simd<Tx, N> vals) {
414+
simd<Tx, N> vals) {
409415
#ifdef __ESIMD_FORCE_STATELESS_MEM
410416
block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
411417
#else
@@ -438,7 +444,6 @@ ESIMD_INLINE
438444
!std::is_pointer<AccessorTy>::value>
439445
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
440446
uint32_t glob_offset, simd_mask<N> mask) {
441-
442447
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
443448
// TODO (performance) use hardware-supported scale once BE supports it
444449
constexpr int16_t scale = 0;
@@ -522,7 +527,7 @@ gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
522527
/// @anchor accessor_gather Accessor-based gather.
523528
///
524529
/// Collects elements located at given offsets in an accessor and returns them
525-
/// as a single \ref simd object. An element can be 1, 2 or 4-byte value.
530+
/// as a single \ref simd object. An element can be a 1, 2 or 4-byte value.
526531
///
527532
/// @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
528533
/// \c float.
@@ -537,10 +542,11 @@ gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
537542
/// undefined.
538543
///
539544
template <typename T, int N, typename AccessorTy>
540-
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
541-
(N == 1 || N == 8 || N == 16 || N == 32) &&
542-
!std::is_pointer<AccessorTy>::value,
543-
simd<T, N>>
545+
__ESIMD_API std::enable_if_t<
546+
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
547+
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
548+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
549+
simd<T, N>>
544550
gather(AccessorTy acc,
545551
#ifdef __ESIMD_FORCE_STATELESS_MEM
546552
simd<uint64_t, N> offsets, uint64_t glob_offset = 0,
@@ -574,7 +580,7 @@ gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset = 0,
574580
/// Accessor-based scatter.
575581
///
576582
/// Writes elements of a \ref simd object into an accessor at given offsets.
577-
/// An element can be 1, 2 or 4-byte value.
583+
/// An element can be a 1, 2 or 4-byte value.
578584
///
579585
/// @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
580586
/// \c float.
@@ -590,9 +596,10 @@ gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset = 0,
590596
///
591597
///
592598
template <typename T, int N, typename AccessorTy>
593-
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
594-
(N == 1 || N == 8 || N == 16 || N == 32) &&
595-
!std::is_pointer<AccessorTy>::value>
599+
__ESIMD_API std::enable_if_t<
600+
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
601+
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
602+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
596603
scatter(AccessorTy acc,
597604
#ifdef __ESIMD_FORCE_STATELESS_MEM
598605
simd<uint64_t, N> offsets,
@@ -863,7 +870,7 @@ __ESIMD_API std::
863870
///
864871
/// @tparam RGBAMask A pixel's channel mask.
865872
/// @tparam AccessorT The accessor type for the memory to be loaded/gathered.
866-
/// The returned vector elements mutch the accessor data type. The loaded
873+
/// The returned vector elements must match the accessor data type. The loaded
867874
/// elements must be 4 bytes in size.
868875
/// @tparam N Number of pixels to access (matches the size of the \c offsets
869876
/// vector). Must be 8, 16 or 32.
@@ -878,9 +885,11 @@ __ESIMD_API std::
878885
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
879886
typename AccessorT, int N,
880887
typename T = typename AccessorT::value_type>
881-
__ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
882-
sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
883-
simd<T, N * get_num_channels_enabled(RGBAMask)>>
888+
__ESIMD_API std::enable_if_t<
889+
((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
890+
!std::is_pointer_v<AccessorT> &&
891+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorT>),
892+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
884893
gather_rgba(AccessorT acc,
885894
#ifdef __ESIMD_FORCE_STATELESS_MEM
886895
simd<uint64_t, N> offsets, uint64_t global_offset = 0,
@@ -923,7 +932,7 @@ gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
923932
/// the operation semantics and parameter restrictions/interdependencies.
924933
/// @tparam RGBAMask Pixel's channel mask.
925934
/// @tparam AccessorT The accessor type for the memory to be stored/scattered.
926-
/// The returned vector elements mast match the accessor data type. The loaded
935+
/// The returned vector elements must match the accessor data type. The loaded
927936
/// elements must be 4 bytes in size.
928937
/// @tparam N The number of elements to access.
929938
/// @param offsets Byte offsets of each element.
@@ -934,8 +943,10 @@ gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
934943
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
935944
typename AccessorT, int N,
936945
typename T = typename AccessorT::value_type>
937-
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
938-
!std::is_pointer_v<AccessorT>>
946+
__ESIMD_API std::enable_if_t<
947+
(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
948+
!std::is_pointer_v<AccessorT> &&
949+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorT>>
939950
scatter_rgba(AccessorT acc,
940951
#ifdef __ESIMD_FORCE_STATELESS_MEM
941952
simd<uint64_t, N> offsets,
@@ -2003,12 +2014,176 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
20032014
}
20042015
}
20052016

2017+
/// Variant of block_load that uses local accessor as a parameter
2018+
/// Loads a contiguous block of memory from given accessor and offset and
2019+
/// returns the loaded data as a vector. Actual code generated depends on
2020+
/// the alignment parameter.
2021+
/// @tparam Tx Element type.
2022+
/// @tparam N Number of elements to load, <code>N * sizeof(Tx)</code> must
2023+
/// be 1, 2, 4 or 8 owords long.
2024+
/// @tparam AccessorTy Accessor type (auto-deduced).
2025+
/// @tparam Flags The alignment specifier type tag. Auto-deduced from the
2026+
/// \c Flags parameter. If it is less than \c 16, then slower unaligned
2027+
/// access is generated, otherwise the access is aligned.
2028+
/// @param acc The accessor.
2029+
/// @param offset The offset to load from in bytes.
2030+
/// @return A vector of loaded elements.
2031+
///
2032+
template <typename Tx, int N, typename AccessorTy,
2033+
typename = std::enable_if_t<
2034+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>>
2035+
__ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset) {
2036+
return slm_block_load<Tx, N>(offset +
2037+
__ESIMD_DNS::localAccessorToOffset(acc));
2038+
}
2039+
2040+
/// Variant of block_store that uses local accessor as a parameter
2041+
/// Stores elements of a vector to a contiguous block of memory represented
2042+
/// by an accessor and an offset within this accessor.
2043+
/// @tparam Tx Element type.
2044+
/// @tparam N Number of elements to store, <code>N * sizeof(Tx)</code> must
2045+
/// be
2046+
/// 1, 2, 4 or 8 owords long.
2047+
/// @tparam AccessorTy Accessor type (auto-deduced).
2048+
/// @param acc The accessor to store to.
2049+
/// @param offset The offset to store at. It is in bytes and must be a
2050+
/// multiple
2051+
/// of \c 16.
2052+
/// @param vals The vector to store.
2053+
///
2054+
template <typename Tx, int N, typename AccessorTy>
2055+
__ESIMD_API std::enable_if_t<
2056+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
2057+
block_store(AccessorTy acc, uint32_t offset, simd<Tx, N> vals) {
2058+
slm_block_store<Tx, N>(offset + __ESIMD_DNS::localAccessorToOffset(acc),
2059+
vals);
2060+
}
2061+
2062+
/// Variant of gather that uses local accessor as a parameter
2063+
///
2064+
/// Collects elements located at given offsets in an accessor and returns them
2065+
/// as a single \ref simd object. An element can be a 1, 2 or 4-byte value.
2066+
///
2067+
/// @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
2068+
/// \c float.
2069+
/// @tparam N The number of vector elements. Can be \c 1, \c 8, \c 16 or \c 32.
2070+
/// @tparam AccessorTy The accessor type.
2071+
/// @param acc The accessor to gather from.
2072+
/// @param offsets Per-element offsets in bytes.
2073+
/// @param glob_offset Offset in bytes added to each individual element's offset
2074+
/// to compute actual memory access offset for that element.
2075+
/// @param mask Memory access mask. Elements with zero corresponding mask's
2076+
/// predicate are not accessed, their values in the resulting vector are
2077+
/// undefined.
2078+
///
2079+
template <typename T, int N, typename AccessorTy>
2080+
__ESIMD_API std::enable_if_t<
2081+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>, simd<T, N>>
2082+
gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
2083+
simd_mask<N> mask = 1) {
2084+
return slm_gather<T, N>(
2085+
offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
2086+
}
2087+
2088+
/// Variant of scatter that uses local accessor as a parameter
2089+
///
2090+
/// Writes elements of a \ref simd object into an accessor at given offsets.
2091+
/// An element can be a 1, 2 or 4-byte value.
2092+
///
2093+
/// @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
2094+
/// \c float.
2095+
/// @tparam N The number of vector elements. Can be \c 1, \c 8, \c 16 or \c 32.
2096+
/// @tparam AccessorTy The accessor type.
2097+
/// @param acc The accessor to scatter to.
2098+
/// @param offsets Per-element offsets in bytes.
2099+
/// @param vals Values to write.
2100+
/// @param glob_offset Offset in bytes added to each individual element's offset
2101+
/// to compute actual memory access offset for that element.
2102+
/// @param mask Memory access mask. Elements with zero corresponding mask's
2103+
/// predicate are not accessed.
2104+
///
2105+
///
2106+
template <typename T, int N, typename AccessorTy>
2107+
__ESIMD_API std::enable_if_t<
2108+
sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
2109+
scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
2110+
uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
2111+
slm_scatter<T, N>(offsets + glob_offset +
2112+
__ESIMD_DNS::localAccessorToOffset(acc),
2113+
vals, mask);
2114+
}
2115+
2116+
/// Variant of gather_rgba that uses local accessor as a parameter
2117+
///
2118+
/// Gather and transpose pixels from the given memory locations defined by the
2119+
/// base specified by \c acc, the global offset \c global_offset and a vector of
2120+
/// offsets \c offsets. Up to 4 32-bit data elements may be accessed at each
2121+
/// address depending on the channel mask \c RGBAMask. Each pixel's address must
2122+
/// be 4-byte aligned.
2123+
/// For usage examples, see \ref usm_gather_rgba above, the only difference
2124+
/// would be the usage of an accessor instead of a usm pointer.
2125+
///
2126+
/// @tparam RGBAMask A pixel's channel mask.
2127+
/// @tparam AccessorT The accessor type for the memory to be loaded/gathered.
2128+
/// The returned vector elements must match the accessor data type. The loaded
2129+
/// elements must be 4 bytes in size.
2130+
/// @tparam N Number of pixels to access (matches the size of the \c offsets
2131+
/// vector). Must be 8, 16 or 32.
2132+
/// @param acc The accessor representing memory address of the access.
2133+
/// @param offsets Byte offsets of the pixels relative to the base pointer.
2134+
/// @param global_offset Byte offset of the pixels relative to the base pointer.
2135+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
2136+
/// predicate are not accessed. Their values in the resulting vector are
2137+
/// undefined.
2138+
/// @return Read data - up to N*4 values of type \c Tx.
2139+
///
2140+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
2141+
typename AccessorT, int N,
2142+
typename T = typename AccessorT::value_type>
2143+
__ESIMD_API std::enable_if_t<
2144+
sycl::detail::acc_properties::is_local_accessor_v<AccessorT>,
2145+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
2146+
gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
2147+
uint32_t global_offset = 0, simd_mask<N> mask = 1) {
2148+
return slm_gather_rgba<T, N, RGBAMask>(
2149+
offsets + global_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
2150+
}
2151+
2152+
/// Variant of scatter_rgba that uses local accessor as a parameter
2153+
/// Gather data from the memory addressed by accessor \c acc, offset common
2154+
/// for all loaded elements \c global_offset and per-element offsets \c offsets,
2155+
/// and return it as simd vector. See @ref usm_gather_rgba for information about
2156+
/// the operation semantics and parameter restrictions/interdependencies.
2157+
///
2158+
/// @tparam RGBAMask Pixel's channel mask.
2159+
/// @tparam AccessorT The accessor type for the memory to be stored/scattered.
2160+
/// The returned vector elements must match the accessor data type. The loaded
2161+
/// elements must be 4 bytes in size.
2162+
/// @tparam N The number of elements to access.
2163+
/// @param offsets Byte offsets of each element.
2164+
/// @param vals values to be written.
2165+
/// @param global_offset Byte offset of the pixels relative to the base pointer.
2166+
/// @param mask Operation mask. All-1 by default.
2167+
///
2168+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
2169+
typename AccessorT, int N,
2170+
typename T = typename AccessorT::value_type>
2171+
__ESIMD_API std::enable_if_t<
2172+
sycl::detail::acc_properties::is_local_accessor_v<AccessorT>>
2173+
scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
2174+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
2175+
uint32_t global_offset = 0, simd_mask<N> mask = 1) {
2176+
detail::validate_rgba_write_channel_mask<RGBAMask>();
2177+
slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
2178+
__ESIMD_DNS::localAccessorToOffset(acc),
2179+
vals, mask);
2180+
}
20062181
/// @} sycl_esimd_memory
20072182

20082183
/// @cond EXCLUDE
20092184

20102185
namespace detail {
2011-
// ----- Outlined implementations of simd_obj_impl class memory access APIs.
2186+
// -- Outlined implementations of simd_obj_impl class memory access APIs.
20122187

20132188
template <typename T, int N, class T1, class SFINAE>
20142189
template <typename Flags, int ChunkSize, typename>

0 commit comments

Comments
 (0)