@@ -283,7 +283,7 @@ scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
283
283
// / 1, 2, 4 or 8 owords long.
284
284
// / @tparam Flags The alignment specifier type tag. Auto-deduced from the
285
285
// / \c Flags parameter. If it is less than \c 16, then slower unaligned
286
- // / access is generated, othewise the access is aligned.
286
+ // / access is generated, otherwise the access is aligned.
287
287
// / @param addr The address to load from.
288
288
// / @param Flags Specifies the alignment.
289
289
// / @return A vector of loaded elements.
@@ -320,15 +320,18 @@ __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
320
320
// / @tparam AccessorTy Accessor type (auto-deduced).
321
321
// / @tparam Flags The alignment specifier type tag. Auto-deduced from the
322
322
// / \c Flags parameter. If it is less than \c 16, then slower unaligned
323
- // / access is generated, othewise the access is aligned.
323
+ // / access is generated, otherwise the access is aligned.
324
324
// / @param acc The accessor.
325
325
// / @param offset The offset to load from in bytes.
326
326
// / @param Flags Specifies the alignment.
327
327
// / @return A vector of loaded elements.
328
328
// /
329
329
template <typename Tx, int N, typename AccessorTy,
330
330
typename Flags = vector_aligned_tag,
331
- typename = std::enable_if_t <is_simd_flag_type_v<Flags>>,
331
+ typename = std::enable_if_t <
332
+ is_simd_flag_type_v<Flags> &&
333
+ sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
334
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>,
332
335
class T = detail::__raw_t <Tx>>
333
336
__ESIMD_API simd<Tx, N> block_load (AccessorTy acc,
334
337
#ifdef __ESIMD_FORCE_STATELESS_MEM
@@ -399,13 +402,16 @@ __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
399
402
// /
400
403
template <typename Tx, int N, typename AccessorTy,
401
404
class T = detail::__raw_t <Tx>>
402
- __ESIMD_API void block_store (AccessorTy acc,
405
+ __ESIMD_API std::enable_if_t <
406
+ sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
407
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
408
+ block_store (AccessorTy acc,
403
409
#ifdef __ESIMD_FORCE_STATELESS_MEM
404
- uint64_t offset,
410
+ uint64_t offset,
405
411
#else
406
- uint32_t offset,
412
+ uint32_t offset,
407
413
#endif
408
- simd<Tx, N> vals) {
414
+ simd<Tx, N> vals) {
409
415
#ifdef __ESIMD_FORCE_STATELESS_MEM
410
416
block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
411
417
#else
@@ -438,7 +444,6 @@ ESIMD_INLINE
438
444
!std::is_pointer<AccessorTy>::value>
439
445
scatter_impl (AccessorTy acc, simd<T, N> vals, simd<uint32_t , N> offsets,
440
446
uint32_t glob_offset, simd_mask<N> mask) {
441
-
442
447
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof (T)>();
443
448
// TODO (performance) use hardware-supported scale once BE supports it
444
449
constexpr int16_t scale = 0 ;
@@ -522,7 +527,7 @@ gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
522
527
// / @anchor accessor_gather Accessor-based gather.
523
528
// /
524
529
// / Collects elements located at given offsets in an accessor and returns them
525
- // / as a single \ref simd object. An element can be 1, 2 or 4-byte value.
530
+ // / as a single \ref simd object. An element can be a 1, 2 or 4-byte value.
526
531
// /
527
532
// / @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
528
533
// / \c float.
@@ -537,10 +542,11 @@ gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
537
542
// / undefined.
538
543
// /
539
544
template <typename T, int N, typename AccessorTy>
540
- __ESIMD_API std::enable_if_t <(sizeof (T) <= 4 ) &&
541
- (N == 1 || N == 8 || N == 16 || N == 32 ) &&
542
- !std::is_pointer<AccessorTy>::value,
543
- simd<T, N>>
545
+ __ESIMD_API std::enable_if_t <
546
+ (sizeof (T) <= 4 ) && (N == 1 || N == 8 || N == 16 || N == 32 ) &&
547
+ sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
548
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
549
+ simd<T, N>>
544
550
gather (AccessorTy acc,
545
551
#ifdef __ESIMD_FORCE_STATELESS_MEM
546
552
simd<uint64_t , N> offsets, uint64_t glob_offset = 0 ,
@@ -574,7 +580,7 @@ gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset = 0,
574
580
// / Accessor-based scatter.
575
581
// /
576
582
// / Writes elements of a \ref simd object into an accessor at given offsets.
577
- // / An element can be 1, 2 or 4-byte value.
583
+ // / An element can be a 1, 2 or 4-byte value.
578
584
// /
579
585
// / @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
580
586
// / \c float.
@@ -590,9 +596,10 @@ gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset = 0,
590
596
// /
591
597
// /
592
598
template <typename T, int N, typename AccessorTy>
593
- __ESIMD_API std::enable_if_t <(sizeof (T) <= 4 ) &&
594
- (N == 1 || N == 8 || N == 16 || N == 32 ) &&
595
- !std::is_pointer<AccessorTy>::value>
599
+ __ESIMD_API std::enable_if_t <
600
+ (sizeof (T) <= 4 ) && (N == 1 || N == 8 || N == 16 || N == 32 ) &&
601
+ sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
602
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
596
603
scatter (AccessorTy acc,
597
604
#ifdef __ESIMD_FORCE_STATELESS_MEM
598
605
simd<uint64_t , N> offsets,
@@ -863,7 +870,7 @@ __ESIMD_API std::
863
870
// /
864
871
// / @tparam RGBAMask A pixel's channel mask.
865
872
// / @tparam AccessorT The accessor type for the memory to be loaded/gathered.
866
- // / The returned vector elements mutch the accessor data type. The loaded
873
+ // / The returned vector elements must match the accessor data type. The loaded
867
874
// / elements must be 4 bytes in size.
868
875
// / @tparam N Number of pixels to access (matches the size of the \c offsets
869
876
// / vector). Must be 8, 16 or 32.
@@ -878,9 +885,11 @@ __ESIMD_API std::
878
885
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
879
886
typename AccessorT, int N,
880
887
typename T = typename AccessorT::value_type>
881
- __ESIMD_API std::enable_if_t <((N == 8 || N == 16 || N == 32 ) &&
882
- sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
883
- simd<T, N * get_num_channels_enabled(RGBAMask)>>
888
+ __ESIMD_API std::enable_if_t <
889
+ ((N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4 &&
890
+ !std::is_pointer_v<AccessorT> &&
891
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorT>),
892
+ simd<T, N * get_num_channels_enabled(RGBAMask)>>
884
893
gather_rgba(AccessorT acc,
885
894
#ifdef __ESIMD_FORCE_STATELESS_MEM
886
895
simd<uint64_t , N> offsets, uint64_t global_offset = 0 ,
@@ -923,7 +932,7 @@ gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
923
932
// / the operation semantics and parameter restrictions/interdependencies.
924
933
// / @tparam RGBAMask Pixel's channel mask.
925
934
// / @tparam AccessorT The accessor type for the memory to be stored/scattered.
926
- // / The returned vector elements mast match the accessor data type. The loaded
935
+ // / The returned vector elements must match the accessor data type. The loaded
927
936
// / elements must be 4 bytes in size.
928
937
// / @tparam N The number of elements to access.
929
938
// / @param offsets Byte offsets of each element.
@@ -934,8 +943,10 @@ gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
934
943
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
935
944
typename AccessorT, int N,
936
945
typename T = typename AccessorT::value_type>
937
- __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4 &&
938
- !std::is_pointer_v<AccessorT>>
946
+ __ESIMD_API std::enable_if_t <
947
+ (N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4 &&
948
+ !std::is_pointer_v<AccessorT> &&
949
+ !sycl::detail::acc_properties::is_local_accessor_v<AccessorT>>
939
950
scatter_rgba(AccessorT acc,
940
951
#ifdef __ESIMD_FORCE_STATELESS_MEM
941
952
simd<uint64_t , N> offsets,
@@ -2003,12 +2014,176 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
2003
2014
}
2004
2015
}
2005
2016
2017
+ // / Variant of block_load that uses local accessor as a parameter
2018
+ // / Loads a contiguous block of memory from given accessor and offset and
2019
+ // / returns the loaded data as a vector. Actual code generated depends on
2020
+ // / the alignment parameter.
2021
+ // / @tparam Tx Element type.
2022
+ // / @tparam N Number of elements to load, <code>N * sizeof(Tx)</code> must
2023
+ // / be 1, 2, 4 or 8 owords long.
2024
+ // / @tparam AccessorTy Accessor type (auto-deduced).
2025
+ // / @tparam Flags The alignment specifier type tag. Auto-deduced from the
2026
+ // / \c Flags parameter. If it is less than \c 16, then slower unaligned
2027
+ // / access is generated, otherwise the access is aligned.
2028
+ // / @param acc The accessor.
2029
+ // / @param offset The offset to load from in bytes.
2030
+ // / @return A vector of loaded elements.
2031
+ // /
2032
+ template <typename Tx, int N, typename AccessorTy,
2033
+ typename = std::enable_if_t <
2034
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>>
2035
+ __ESIMD_API simd<Tx, N> block_load (AccessorTy acc, uint32_t offset) {
2036
+ return slm_block_load<Tx, N>(offset +
2037
+ __ESIMD_DNS::localAccessorToOffset (acc));
2038
+ }
2039
+
2040
+ // / Variant of block_store that uses local accessor as a parameter
2041
+ // / Stores elements of a vector to a contiguous block of memory represented
2042
+ // / by an accessor and an offset within this accessor.
2043
+ // / @tparam Tx Element type.
2044
+ // / @tparam N Number of elements to store, <code>N * sizeof(Tx)</code> must
2045
+ // / be
2046
+ // / 1, 2, 4 or 8 owords long.
2047
+ // / @tparam AccessorTy Accessor type (auto-deduced).
2048
+ // / @param acc The accessor to store to.
2049
+ // / @param offset The offset to store at. It is in bytes and must be a
2050
+ // / multiple
2051
+ // / of \c 16.
2052
+ // / @param vals The vector to store.
2053
+ // /
2054
+ template <typename Tx, int N, typename AccessorTy>
2055
+ __ESIMD_API std::enable_if_t <
2056
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
2057
+ block_store (AccessorTy acc, uint32_t offset, simd<Tx, N> vals) {
2058
+ slm_block_store<Tx, N>(offset + __ESIMD_DNS::localAccessorToOffset (acc),
2059
+ vals);
2060
+ }
2061
+
2062
+ // / Variant of gather that uses local accessor as a parameter
2063
+ // /
2064
+ // / Collects elements located at given offsets in an accessor and returns them
2065
+ // / as a single \ref simd object. An element can be a 1, 2 or 4-byte value.
2066
+ // /
2067
+ // / @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
2068
+ // / \c float.
2069
+ // / @tparam N The number of vector elements. Can be \c 1, \c 8, \c 16 or \c 32.
2070
+ // / @tparam AccessorTy The accessor type.
2071
+ // / @param acc The accessor to gather from.
2072
+ // / @param offsets Per-element offsets in bytes.
2073
+ // / @param glob_offset Offset in bytes added to each individual element's offset
2074
+ // / to compute actual memory access offset for that element.
2075
+ // / @param mask Memory access mask. Elements with zero corresponding mask's
2076
+ // / predicate are not accessed, their values in the resulting vector are
2077
+ // / undefined.
2078
+ // /
2079
+ template <typename T, int N, typename AccessorTy>
2080
+ __ESIMD_API std::enable_if_t <
2081
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>, simd<T, N>>
2082
+ gather (AccessorTy acc, simd<uint32_t , N> offsets, uint32_t glob_offset = 0 ,
2083
+ simd_mask<N> mask = 1 ) {
2084
+ return slm_gather<T, N>(
2085
+ offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset (acc), mask);
2086
+ }
2087
+
2088
+ // / Variant of scatter that uses local accessor as a parameter
2089
+ // /
2090
+ // / Writes elements of a \ref simd object into an accessor at given offsets.
2091
+ // / An element can be a 1, 2 or 4-byte value.
2092
+ // /
2093
+ // / @tparam T Element type; can only be a 1,2,4-byte integer, \c sycl::half or
2094
+ // / \c float.
2095
+ // / @tparam N The number of vector elements. Can be \c 1, \c 8, \c 16 or \c 32.
2096
+ // / @tparam AccessorTy The accessor type.
2097
+ // / @param acc The accessor to scatter to.
2098
+ // / @param offsets Per-element offsets in bytes.
2099
+ // / @param vals Values to write.
2100
+ // / @param glob_offset Offset in bytes added to each individual element's offset
2101
+ // / to compute actual memory access offset for that element.
2102
+ // / @param mask Memory access mask. Elements with zero corresponding mask's
2103
+ // / predicate are not accessed.
2104
+ // /
2105
+ // /
2106
+ template <typename T, int N, typename AccessorTy>
2107
+ __ESIMD_API std::enable_if_t <
2108
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>>
2109
+ scatter (AccessorTy acc, simd<uint32_t , N> offsets, simd<T, N> vals,
2110
+ uint32_t glob_offset = 0 , simd_mask<N> mask = 1 ) {
2111
+ slm_scatter<T, N>(offsets + glob_offset +
2112
+ __ESIMD_DNS::localAccessorToOffset (acc),
2113
+ vals, mask);
2114
+ }
2115
+
2116
+ // / Variant of gather_rgba that uses local accessor as a parameter
2117
+ // /
2118
+ // / Gather and transpose pixels from the given memory locations defined by the
2119
+ // / base specified by \c acc, the global offset \c global_offset and a vector of
2120
+ // / offsets \c offsets. Up to 4 32-bit data elements may be accessed at each
2121
+ // / address depending on the channel mask \c RGBAMask. Each pixel's address must
2122
+ // / be 4-byte aligned.
2123
+ // / For usage examples, see \ref usm_gather_rgba above, the only difference
2124
+ // / would be the usage of an accessor instead of a usm pointer.
2125
+ // /
2126
+ // / @tparam RGBAMask A pixel's channel mask.
2127
+ // / @tparam AccessorT The accessor type for the memory to be loaded/gathered.
2128
+ // / The returned vector elements must match the accessor data type. The loaded
2129
+ // / elements must be 4 bytes in size.
2130
+ // / @tparam N Number of pixels to access (matches the size of the \c offsets
2131
+ // / vector). Must be 8, 16 or 32.
2132
+ // / @param acc The accessor representing memory address of the access.
2133
+ // / @param offsets Byte offsets of the pixels relative to the base pointer.
2134
+ // / @param global_offset Byte offset of the pixels relative to the base pointer.
2135
+ // / @param mask Memory access mask. Pixels with zero corresponding mask's
2136
+ // / predicate are not accessed. Their values in the resulting vector are
2137
+ // / undefined.
2138
+ // / @return Read data - up to N*4 values of type \c Tx.
2139
+ // /
2140
+ template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
2141
+ typename AccessorT, int N,
2142
+ typename T = typename AccessorT::value_type>
2143
+ __ESIMD_API std::enable_if_t <
2144
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorT>,
2145
+ simd<T, N * get_num_channels_enabled (RGBAMask)>>
2146
+ gather_rgba(AccessorT acc, simd<uint32_t , N> offsets,
2147
+ uint32_t global_offset = 0 , simd_mask<N> mask = 1 ) {
2148
+ return slm_gather_rgba<T, N, RGBAMask>(
2149
+ offsets + global_offset + __ESIMD_DNS::localAccessorToOffset (acc), mask);
2150
+ }
2151
+
2152
+ // / Variant of scatter_rgba that uses local accessor as a parameter
2153
+ // / Gather data from the memory addressed by accessor \c acc, offset common
2154
+ // / for all loaded elements \c global_offset and per-element offsets \c offsets,
2155
+ // / and return it as simd vector. See @ref usm_gather_rgba for information about
2156
+ // / the operation semantics and parameter restrictions/interdependencies.
2157
+ // /
2158
+ // / @tparam RGBAMask Pixel's channel mask.
2159
+ // / @tparam AccessorT The accessor type for the memory to be stored/scattered.
2160
+ // / The returned vector elements must match the accessor data type. The loaded
2161
+ // / elements must be 4 bytes in size.
2162
+ // / @tparam N The number of elements to access.
2163
+ // / @param offsets Byte offsets of each element.
2164
+ // / @param vals values to be written.
2165
+ // / @param global_offset Byte offset of the pixels relative to the base pointer.
2166
+ // / @param mask Operation mask. All-1 by default.
2167
+ // /
2168
+ template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
2169
+ typename AccessorT, int N,
2170
+ typename T = typename AccessorT::value_type>
2171
+ __ESIMD_API std::enable_if_t <
2172
+ sycl::detail::acc_properties::is_local_accessor_v<AccessorT>>
2173
+ scatter_rgba (AccessorT acc, simd<uint32_t , N> offsets,
2174
+ simd<T, N * get_num_channels_enabled (RGBAMask)> vals,
2175
+ uint32_t global_offset = 0, simd_mask<N> mask = 1) {
2176
+ detail::validate_rgba_write_channel_mask<RGBAMask>();
2177
+ slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
2178
+ __ESIMD_DNS::localAccessorToOffset (acc),
2179
+ vals, mask);
2180
+ }
2006
2181
// / @} sycl_esimd_memory
2007
2182
2008
2183
// / @cond EXCLUDE
2009
2184
2010
2185
namespace detail {
2011
- // ----- Outlined implementations of simd_obj_impl class memory access APIs.
2186
+ // -- Outlined implementations of simd_obj_impl class memory access APIs.
2012
2187
2013
2188
template <typename T, int N, class T1 , class SFINAE >
2014
2189
template <typename Flags, int ChunkSize, typename >
0 commit comments