Skip to content

Commit 4c13388

Browse files
authored
[SYCL][ESIMD] Introduce Masked Compress load/store API (#14941)
1 parent 6dbee94 commit 4c13388

File tree

5 files changed

+502
-0
lines changed

5 files changed

+502
-0
lines changed

sycl/include/sycl/ext/intel/esimd/detail/util.hpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,29 @@ constexpr void check_wrregion_params() {
281281
check_rdregion_params<N, M, VStride, Width, Stride>();
282282
}
283283

284+
// Generate an array of bitmasks for compressed load/store -- all 1 bits
285+
// strictly less than i -- [0 1 3 7 15 31 63 127 255 511 1023 2047 ... ]
286+
template <uint32_t... args> struct CompressedBitmask {
287+
static const uint32_t value[sizeof...(args)];
288+
};
289+
290+
template <uint32_t... args>
291+
const uint32_t CompressedBitmask<args...>::value[sizeof...(args)] = {args...};
292+
293+
template <int N, unsigned... args> struct GenerateCompressedBitmaskImpl {
294+
using value =
295+
typename GenerateCompressedBitmaskImpl<N - 1, ~(((uint32_t)(~0)) << N),
296+
args...>::value;
297+
};
298+
299+
template <unsigned... args> struct GenerateCompressedBitmaskImpl<0, args...> {
300+
using value = CompressedBitmask<0, args...>;
301+
};
302+
303+
template <int N> struct GenerateCompressedBitmask {
304+
using value = typename GenerateCompressedBitmaskImpl<N - 1>::value;
305+
};
306+
284307
} // namespace ext::intel::esimd::detail
285308
} // namespace _V1
286309
} // namespace sycl

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14302,6 +14302,241 @@ named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode,
1430214302

1430314303
/// @} sycl_esimd_memory_nbarrier
1430414304

14305+
/// @defgroup sycl_esimd_mask_compressed Mask compressed APIs.
14306+
/// @ingroup sycl_esimd_memory
14307+
14308+
/// @addtogroup sycl_esimd_mask_compressed
14309+
/// @{
14310+
14311+
/// template <typename T, int N,
14312+
/// typename PropertyListT = oneapi::experimental::empty_properties_t>
14313+
/// simd<T, N>
14314+
/// mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {});
14315+
/// Mask expand load from USM memory location.
14316+
/// The function reads data from a memory location using following algorithm:
14317+
///
14318+
/// \code{.cpp}
14319+
///
14320+
/// int Index = 0;
14321+
/// for (int i = 0; i < N; ++i) {
14322+
/// if (Mask[i]) {
14323+
/// Result[i] = *(p + Index);
14324+
/// ++Index;
14325+
/// }
14326+
/// }
14327+
/// \endcode
14328+
///
14329+
///
14330+
/// @tparam T is the element type.
14331+
/// @tparam N is the data size.
14332+
/// @param p is the base address for this operation.
14333+
/// @param mask is the mask determining which elements will be read.
14334+
/// @param props The compile-time properties. Only cache hint
14335+
/// properties are used.
14336+
///
14337+
template <typename T, int N,
14338+
typename PropertyListT = oneapi::experimental::empty_properties_t>
14339+
__ESIMD_API std::enable_if_t<
14340+
ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
14341+
mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {}) {
14342+
// offsets::value contains binary masks that for every location at index i it
14343+
// contains i 1's i.e. 0,1,3,7,...
14344+
using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14345+
// Performing '&' operation with packed mask will leave at index i a bitmask
14346+
// with number of 1's corresponding to a number of elements to be loaded so
14347+
// far (number of 1's in the mask preceding the index i). Number of 1's
14348+
// becomes an index for compressed store/expanded load operation.
14349+
simd<uint32_t, N> offset =
14350+
cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14351+
return gather(p, offset * sizeof(T), mask, props);
14352+
}
14353+
14354+
/// template <typename T, int N, typename AccessorTy,
14355+
/// typename PropertyListT = oneapi::experimental::empty_properties_t>
14356+
/// simd<T, N>
14357+
/// mask_expand_load(AccessorTy acc, simd_mask<N> mask, PropertyListT props =
14358+
/// {});
14359+
/// Mask expand load from accessor memory (could be local or device
14360+
/// accessor). The function reads data from a memory location using following
14361+
/// algorithm:
14362+
///
14363+
/// \code{.cpp}
14364+
///
14365+
/// int Index = 0;
14366+
/// for (int i = 0; i < N; ++i) {
14367+
/// if (Mask[i])
14368+
/// Result[i] = acc[global_offset + Index++];
14369+
/// }
14370+
/// \endcode
14371+
///
14372+
///
14373+
/// @tparam T is the element type.
14374+
/// @tparam N is the data size.
14375+
/// @param acc is the accessor to read from.
14376+
/// @param global_offset is the global offset in bytes.
14377+
/// @param mask is the mask determining which elements will be read.
14378+
/// @param props The compile-time properties. Only cache hint
14379+
/// properties are used.
14380+
///
14381+
template <typename T, int N, typename AccessorTy,
14382+
typename PropertyListT = oneapi::experimental::empty_properties_t>
14383+
__ESIMD_API std::enable_if_t<
14384+
ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14385+
detail::is_accessor_with_v<AccessorTy,
14386+
detail::accessor_mode_cap::can_read>,
14387+
simd<T, N>>
14388+
mask_expand_load(AccessorTy acc, uint32_t global_offset, simd_mask<N> mask,
14389+
PropertyListT props = {}) {
14390+
// offsets::value contains binary masks that for every location at index i it
14391+
// contains i 1's i.e. 0,1,3,7,...
14392+
using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14393+
// Performing '&' operation with packed mask will leave at index i a bitmask
14394+
// with number of 1's corresponding to a number of elements to be loaded so
14395+
// far (number of 1's in the mask preceding the index i). Number of 1's
14396+
// becomes an index for compressed store/expanded load operation.
14397+
simd<uint32_t, N> offset =
14398+
cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14399+
return gather<T>(acc, offset * sizeof(T) + global_offset, mask, props);
14400+
}
14401+
14402+
/// template <typename T, int N,
14403+
/// typename PropertyListT = oneapi::experimental::empty_properties_t>
14404+
/// simd<T, N>
14405+
/// mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14406+
/// PropertyListT props = {});
14407+
/// Mask compressed store to USM memory location.
14408+
/// The function stores data to a memory location using following algorithm:
14409+
///
14410+
/// \code{.cpp}
14411+
///
14412+
/// int Index = 0;
14413+
/// for (int i = 0; i < N; ++i) {
14414+
/// if (Mask[i]) {
14415+
/// *(p + Index) = val[i];
14416+
/// ++Index;
14417+
/// }
14418+
/// }
14419+
/// \endcode
14420+
///
14421+
///
14422+
/// @tparam T is the element type.
14423+
/// @tparam N is the data size.
14424+
/// @param p is the base address for this operation.
14425+
/// @param vals is the data to store.
14426+
/// @param mask is the mask determining which elements will be stored.
14427+
/// @param props The compile-time properties. Only cache hint
14428+
/// properties are used.
14429+
///
14430+
template <typename T, int N,
14431+
typename PropertyListT = oneapi::experimental::empty_properties_t>
14432+
__ESIMD_API std::enable_if_t<
14433+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14434+
mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14435+
PropertyListT props = {}) {
14436+
// offsets::value contains binary masks that for every location at index i it
14437+
// contains i 1's i.e. 0,1,3,7,...
14438+
using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14439+
// Performing '&' operation with packed mask will leave at index i a bitmask
14440+
// with number of 1's corresponding to a number of elements to be loaded so
14441+
// far (number of 1's in the mask preceding the index i). Number of 1's
14442+
// becomes an index for compressed store/expanded load operation.
14443+
simd<uint32_t, N> offset =
14444+
cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14445+
scatter(p, offset * sizeof(T), vals, mask, props);
14446+
}
14447+
14448+
/// template <typename T, int N, typename AccessorTy,
14449+
/// typename PropertyListT = oneapi::experimental::empty_properties_t>
14450+
/// simd<T, N>
14451+
/// mask_compress_store(AccessorTy acc, simd<T, N> vals, simd_mask<N> mask,
14452+
/// PropertyListT props = {});
14453+
/// Mask compressed store to accessor memory (could be local or device
14454+
/// accessor).
14455+
/// The function stores data to a memory location using following algorithm:
14456+
///
14457+
/// \code{.cpp}
14458+
///
14459+
/// int Index = 0;
14460+
/// for (int i = 0; i < N; ++i) {
14461+
/// if (Mask[i])
14462+
/// acc[global_offset + Index++] = val[i];
14463+
/// }
14464+
/// \endcode
14465+
///
14466+
///
14467+
/// @tparam T is the element type.
14468+
/// @tparam N is the data size.
14469+
/// @param acc is the accessor to write to.
14470+
/// @param global_offset is the global offset in bytes.
14471+
/// @param vals is the data to store.
14472+
/// @param mask is the mask determining which elements will be stored.
14473+
/// @param props The compile-time properties. Only cache hint
14474+
/// properties are used.
14475+
///
14476+
template <typename T, int N, typename AccessorTy,
14477+
typename PropertyListT = oneapi::experimental::empty_properties_t>
14478+
__ESIMD_API std::enable_if_t<
14479+
ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14480+
detail::is_accessor_with_v<AccessorTy,
14481+
detail::accessor_mode_cap::can_write>>
14482+
mask_compress_store(AccessorTy acc, uint32_t global_offset, simd<T, N> vals,
14483+
simd_mask<N> mask, PropertyListT props = {}) {
14484+
// offsets::value contains binary masks that for every location at index i it
14485+
// contains i 1's i.e. 0,1,3,7,...
14486+
using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14487+
// Performing '&' operation with packed mask will leave at index i a bitmask
14488+
// with number of 1's corresponding to a number of elements to be loaded so
14489+
// far (number of 1's in the mask preceding the index i). Number of 1's
14490+
// becomes an index for compressed store/expanded load operation.
14491+
simd<uint32_t, N> offset =
14492+
cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14493+
scatter<T, N>(acc, offset * sizeof(T) + global_offset, vals, mask, props);
14494+
}
14495+
14496+
/// template <typename T, int N, int M>
14497+
/// simd<T, N>
14498+
/// mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N>
14499+
/// vals, simd_mask<N> mask);
14500+
/// Mask compressed store to another vector. The function reads data to a
14501+
/// vector using following algorithm:
14502+
///
14503+
/// \code{.cpp}
14504+
///
14505+
/// int Index = 0;
14506+
/// for (int i = 0; i < N; ++i) {
14507+
/// if (Mask[i])
14508+
/// dst[global_offset + Index++] = vals[i];
14509+
/// }
14510+
/// \endcode
14511+
///
14512+
///
14513+
/// @tparam T is the element type.
14514+
/// @tparam N is the data size.
14515+
/// @tparam M is the source data size.
14516+
/// @param dst is the vector to write to.
14517+
/// @param global_offset is an offset to use for all writes.
14518+
/// @param vals is the data to store.
14519+
/// @param mask is the mask determining which elements will be stored.
14520+
///
14521+
template <typename T, int N, int M>
14522+
__ESIMD_API std::enable_if_t<M >= N>
14523+
mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N> vals,
14524+
simd_mask<N> mask) {
14525+
// offsets::value contains binary masks that for every location at index i it
14526+
// contains i 1's i.e. 0,1,3,7,...
14527+
using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14528+
// Performing '&' operation with packed mask will leave at index i a bitmask
14529+
// with number of 1's corresponding to a number of elements to be loaded so
14530+
// far (number of 1's in the mask preceding the index i). Number of 1's
14531+
// becomes an index for compressed store/expanded load operation.
14532+
simd<uint32_t, N> offset =
14533+
cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14534+
14535+
simd<uint16_t, N> Indices = global_offset + offset;
14536+
dst.iupdate(Indices, vals, mask);
14537+
}
14538+
14539+
/// @} sycl_esimd_mask_compressed
1430514540
/// @} sycl_esimd_memory
1430614541

1430714542
/// @cond EXCLUDE
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
//=mask_compress_store_acc.cpp-Test to verify compressed store functionality=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// This is a basic test to validate the compressed store API.
13+
14+
#include "esimd_test_utils.hpp"
15+
16+
using namespace sycl;
17+
using namespace sycl::ext::intel::esimd;
18+
19+
template <int N> bool test() {
20+
std::vector<uint32_t> OutputAcc(N);
21+
std::vector<uint32_t> ExpectedOutput(N);
22+
23+
int idx = 0;
24+
for (int I = 0; I < N; I++) {
25+
if ((I % 2) == 0)
26+
ExpectedOutput[idx++] = I + 1;
27+
}
28+
{
29+
auto Queue = queue{gpu_selector_v};
30+
esimd_test::printTestLabel(Queue);
31+
32+
sycl::buffer<uint32_t, 1> OutputAcc_buffer(OutputAcc.data(),
33+
OutputAcc.size());
34+
35+
auto e = Queue.submit([&](sycl::handler &cgh) {
36+
auto OutputAcc_out =
37+
OutputAcc_buffer.get_access<access::mode::read_write>(cgh);
38+
39+
auto kernel = ([=]() [[intel::sycl_explicit_simd]] {
40+
simd<uint32_t, N> Input(1, 1);
41+
simd_mask<N> Mask;
42+
for (int i = 0; i < N; i++)
43+
Mask[i] = (i % 2) == 0;
44+
mask_compress_store(OutputAcc_out, 0, Input, Mask);
45+
});
46+
cgh.single_task(kernel);
47+
});
48+
Queue.wait();
49+
}
50+
51+
for (int I = 0; I < N; I++) {
52+
if (OutputAcc[I] != ExpectedOutput[I]) {
53+
std::cout << "mask_compress_store: error at I = " << std::to_string(I)
54+
<< ": " << std::to_string(ExpectedOutput[I])
55+
<< " != " << std::to_string(OutputAcc[I]) << std::endl;
56+
return false;
57+
}
58+
}
59+
60+
return true;
61+
}
62+
63+
int main() {
64+
65+
bool Pass = true;
66+
67+
Pass &= test<8>();
68+
Pass &= test<16>();
69+
Pass &= test<32>();
70+
71+
if (Pass)
72+
std::cout << "Pass" << std::endl;
73+
74+
return !Pass;
75+
}

0 commit comments

Comments
 (0)