Skip to content

Commit 655ab10

Browse files
authored
[ESIMD] Implement gather(acc) accepting compile-time properties (#12414)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent ae24111 commit 655ab10

File tree

9 files changed

+964
-79
lines changed

9 files changed

+964
-79
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 451 additions & 35 deletions
Large diffs are not rendered by default.

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -803,23 +803,10 @@ __ESIMD_API
803803
return lsc_gather<T, NElts, DS, L1H, L3H>(
804804
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred);
805805
#else
806-
detail::check_lsc_vector_size<NElts>();
807-
detail::check_lsc_data_size<T, DS>();
808-
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
809-
constexpr uint16_t _AddressScale = 1;
810-
constexpr int _ImmOffset = 0;
811-
constexpr lsc_data_size _DS =
812-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
813-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
814-
constexpr detail::lsc_data_order _Transposed =
815-
detail::lsc_data_order::nontranspose;
816-
using MsgT = typename detail::lsc_expand_type<T>::type;
817-
auto si = __ESIMD_NS::get_surface_index(acc);
818-
__ESIMD_NS::simd<MsgT, N * NElts> Tmp =
819-
__esimd_lsc_load_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
820-
_Transposed, N>(pred.data(), offsets.data(), si);
821-
return detail::lsc_format_ret<T>(Tmp);
822-
#endif
806+
__ESIMD_NS::simd<T, N * NElts> PassThru; // Intentionally unitialized.
807+
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
808+
acc, offsets, pred, PassThru);
809+
#endif // __ESIMD_FORCE_STATELESS_MEM
823810
}
824811

825812
#ifdef __ESIMD_FORCE_STATELESS_MEM
@@ -891,25 +878,9 @@ __ESIMD_API
891878
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred, pass_thru);
892879

893880
#else
894-
detail::check_lsc_vector_size<NElts>();
895-
detail::check_lsc_data_size<T, DS>();
896-
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
897-
constexpr uint16_t _AddressScale = 1;
898-
constexpr int _ImmOffset = 0;
899-
constexpr lsc_data_size _DS =
900-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
901-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
902-
constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
903-
using MsgT = typename detail::lsc_expand_type<T>::type;
904-
auto SI = __ESIMD_NS::get_surface_index(acc);
905-
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
906-
detail::lsc_format_input<MsgT>(pass_thru);
907-
__ESIMD_NS::simd<MsgT, N * NElts> Result =
908-
__esimd_lsc_load_merge_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
909-
_VS, _Transposed, N>(
910-
pred.data(), offsets.data(), SI, PassThruExpanded.data());
911-
return detail::lsc_format_ret<T>(Result);
912-
#endif
881+
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
882+
acc, offsets, pred, pass_thru);
883+
#endif // __ESIMD_FORCE_STATELESS_MEM
913884
}
914885

915886
#ifdef __ESIMD_FORCE_STATELESS_MEM

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp

Lines changed: 240 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,188 @@ bool testUSM(queue Q, uint32_t MaskStride, PropertiesT) {
207207
}
208208
} // end if (VS == 1)
209209
Vals.copy_to(Out + GlobalID * N);
210-
// scatter(Out, ByteOffsets.template select<NOffsets, 1>(), Vals);
210+
}).wait();
211+
} catch (sycl::exception const &e) {
212+
std::cout << "SYCL exception caught: " << e.what() << '\n';
213+
sycl::free(In, Q);
214+
sycl::free(Out, Q);
215+
return false;
216+
}
217+
218+
bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
219+
if (!Passed)
220+
std::cout << "Case FAILED" << std::endl;
221+
222+
sycl::free(In, Q);
223+
sycl::free(Out, Q);
224+
return Passed;
225+
}
226+
227+
template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
228+
bool UseProperties, typename PropertiesT>
229+
bool testACC(queue Q, uint32_t MaskStride, PropertiesT) {
230+
231+
static_assert(VS > 0 && N % VS == 0,
232+
"Incorrect VS parameter. N must be divisible by VS.");
233+
constexpr int NOffsets = N / VS;
234+
static_assert(!UsePassThru || UseMask,
235+
"PassThru cannot be used without using mask");
236+
237+
uint32_t Groups = 8;
238+
uint32_t Threads = 16;
239+
240+
std::cout << "Running case: T=" << esimd_test::type_name<T>() << ", N=" << N
241+
<< ", VS=" << VS << ", MaskStride=" << MaskStride
242+
<< ", Groups=" << Groups << ", Threads=" << Threads
243+
<< ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
244+
<< ", use_properties=" << UseProperties << std::endl;
245+
246+
uint16_t Size = Groups * Threads * N;
247+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
248+
249+
sycl::range<1> GlobalRange{Groups};
250+
sycl::range<1> LocalRange{Threads};
251+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
252+
253+
T *Out = sycl::malloc_shared<T>(Size, Q);
254+
std::memset(Out, 0, Size * sizeof(T));
255+
256+
T *In = sycl::malloc_shared<T>(Size * 2, Q);
257+
for (int I = 0; I < Size; I++)
258+
In[I] = esimd_test::getRandomValue<T>();
259+
260+
try {
261+
buffer<T, 1> InBuf(In, Size * 2);
262+
Q.submit([&](handler &CGH) {
263+
accessor InAcc{InBuf, CGH};
264+
CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
265+
int GlobalID = NDI.get_global_id(0);
266+
PropertiesT Props{};
267+
268+
simd<OffsetT, NOffsets> ByteOffsets(GlobalID * N * sizeof(T),
269+
VS * sizeof(T));
270+
simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
271+
272+
simd_mask<NOffsets> Pred;
273+
for (int I = 0; I < NOffsets; I++)
274+
Pred[I] = (I % MaskStride == 0) ? 1 : 0;
275+
276+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
277+
simd<Tuint, N> PassThruInt(GlobalID * N, 1);
278+
simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
279+
auto PassThruView = PassThru.template select<N, 1>(0);
280+
281+
simd<T, N> Vals;
282+
if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
283+
if constexpr (UsePassThru) {
284+
if constexpr (UseProperties) {
285+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
286+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru,
287+
Props);
288+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
289+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView,
290+
Props);
291+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
292+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru,
293+
Props);
294+
else // ByteOffset - view, PassThru - view
295+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
296+
PassThruView, Props);
297+
} else { // UseProperties is false
298+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
299+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru);
300+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
301+
Vals =
302+
gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView);
303+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
304+
Vals =
305+
gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru);
306+
else // ByteOffset - view, PassThru - view
307+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
308+
PassThruView);
309+
}
310+
} else if constexpr (UseMask) { // UsePassThru is false
311+
if constexpr (UseProperties) {
312+
if (GlobalID % 2 == 0) // ByteOffset - simd
313+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, Props);
314+
else // ByteOffset - simd_view
315+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, Props);
316+
} else { // UseProperties is false
317+
if (GlobalID % 2 == 0) // ByteOffset - simd
318+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred);
319+
else // ByteOffset - simd_view
320+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred);
321+
}
322+
} else { // UseMask is false, UsePassThru is false
323+
if constexpr (UseProperties) {
324+
if (GlobalID % 2 == 0) // ByteOffset - simd
325+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Props);
326+
else // ByteOffset - simd_view
327+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Props);
328+
} else { // UseProperties is false
329+
if (GlobalID % 2 == 0) // ByteOffset - simd
330+
Vals = gather<T, N, VS>(InAcc, ByteOffsets);
331+
else // ByteOffset - simd_view
332+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView);
333+
}
334+
}
335+
} else {
336+
// if (VS == 1) then <T, N, VS> can often be omitted - test it here.
337+
// The variants accepting simd_view for 'PassThru' operand though
338+
// still require <T, N> to be specified explicitly to help
339+
// C++ FE do simd to simd_view matching.
340+
if constexpr (UsePassThru) {
341+
if constexpr (UseProperties) {
342+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
343+
Vals = gather<T>(InAcc, ByteOffsets, Pred, PassThru, Props);
344+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
345+
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView,
346+
Props);
347+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
348+
Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props);
349+
else // ByteOffset - view, PassThru - view
350+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView,
351+
Props);
352+
} else { // UseProperties is false
353+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
354+
Vals = gather(InAcc, ByteOffsets, Pred, PassThru);
355+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
356+
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView);
357+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
358+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThru);
359+
else // ByteOffset - view, PassThru - view
360+
Vals =
361+
gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView);
362+
}
363+
} else if constexpr (UseMask) { // UsePassThru is false
364+
if constexpr (UseProperties) {
365+
if (GlobalID % 2 == 0) // ByteOffset - simd
366+
Vals = gather<T>(InAcc, ByteOffsets, Pred, Props);
367+
else // ByteOffset - simd_view
368+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, Props);
369+
} else { // UseProperties is false
370+
if (GlobalID % 2 == 0) // ByteOffset - simd
371+
Vals = gather<T>(InAcc, ByteOffsets, Pred);
372+
else // ByteOffset - simd_view
373+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred);
374+
}
375+
} else { // UsePassThru is false, UseMask is false
376+
if constexpr (UseProperties) {
377+
if (GlobalID % 2 == 0) // ByteOffset - simd
378+
Vals = gather<T>(InAcc, ByteOffsets, Props);
379+
else // ByteOffset - simd_view
380+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Props);
381+
} else {
382+
if (GlobalID % 2 == 0) // ByteOffset - simd
383+
Vals = gather<T>(InAcc, ByteOffsets);
384+
else // ByteOffset - simd_view
385+
Vals = gather<T, N>(InAcc, ByteOffsetsView);
386+
}
387+
}
388+
} // end if (VS == 1)
389+
Vals.copy_to(Out + GlobalID * N);
390+
// scatter(Out, ByteOffsets.template select<NOffsets, 1>(), Vals);
391+
});
211392
}).wait();
212393
} catch (sycl::exception const &e) {
213394
std::cout << "SYCL exception caught: " << e.what() << '\n';
@@ -286,3 +467,61 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
286467
}
287468
return Passed;
288469
}
470+
471+
template <typename T, TestFeatures Features> bool testACC(queue Q) {
472+
constexpr bool UseMask = true;
473+
constexpr bool UsePassThru = true;
474+
constexpr bool UseProperties = true;
475+
476+
properties AlignElemProps{alignment<sizeof(T)>};
477+
478+
bool Passed = true;
479+
Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
480+
Q, 2, AlignElemProps);
481+
#ifdef __ESIMD_FORCE_STATELESS_MEM
482+
Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
483+
Q, 2, AlignElemProps);
484+
Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
485+
Q, 2, AlignElemProps);
486+
#endif // __ESIMD_FORCE_STATELESS_MEM
487+
Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
488+
Q, 3, AlignElemProps);
489+
Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(
490+
Q, 2, AlignElemProps);
491+
Passed &= testACC<T, 32, 1, UseMask, !UsePassThru, !UseProperties>(
492+
Q, 3, AlignElemProps);
493+
494+
if constexpr (Features == TestFeatures::PVC ||
495+
Features == TestFeatures::DG2) {
496+
properties LSCProps{cache_hint_L1<cache_hint::streaming>,
497+
cache_hint_L2<cache_hint::cached>,
498+
alignment<sizeof(T)>};
499+
Passed &=
500+
testACC<T, 1, 1, !UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
501+
Passed &=
502+
testACC<T, 2, 1, UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
503+
Passed &=
504+
testACC<T, 4, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);
505+
Passed &=
506+
testACC<T, 8, 1, UseMask, UsePassThru, UseProperties>(Q, 3, LSCProps);
507+
508+
Passed &=
509+
testACC<T, 32, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);
510+
511+
// Check VS > 1. GPU supports only dwords and qwords in this mode.
512+
if constexpr (sizeof(T) >= 4) {
513+
// TODO: This test case causes flaky fail. Enable it after the issue
514+
// in GPU driver is fixed.
515+
// Passed &= testACC<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
516+
// Q, 3, AlignElemProps);
517+
518+
Passed &= testACC<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
519+
Q, 3, AlignElemProps);
520+
Passed &= testACC<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
521+
Q, 3, AlignElemProps);
522+
Passed &= testACC<T, 32, 2, UseMask, UsePassThru, UseProperties>(
523+
Q, 3, AlignElemProps);
524+
}
525+
}
526+
return Passed;
527+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//==------- gather_acc.cpp - DPC++ ESIMD on-device test --------------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Use per-kernel compilation to have more information about failing cases.
9+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// The test verifies esimd::gather() functions accepting ACCESSOR
13+
// and optional compile-time esimd::properties.
14+
// The gather() calls in this test do not use cache-hint properties
15+
// or VS > 1 (number of loads per offset) to not impose using DG2/PVC features.
16+
17+
#include "Inputs/gather.hpp"
18+
19+
int main() {
20+
auto Q = queue{gpu_selector_v};
21+
esimd_test::printTestLabel(Q);
22+
23+
constexpr auto TestFeatures = TestFeatures::Generic;
24+
bool Passed = true;
25+
26+
Passed &= testACC<int8_t, TestFeatures>(Q);
27+
Passed &= testACC<int16_t, TestFeatures>(Q);
28+
if (Q.get_device().has(sycl::aspect::fp16))
29+
Passed &= testACC<sycl::half, TestFeatures>(Q);
30+
Passed &= testACC<uint32_t, TestFeatures>(Q);
31+
Passed &= testACC<float, TestFeatures>(Q);
32+
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
33+
#ifdef __ESIMD_FORCE_STATELESS_MEM
34+
Passed &= testACC<int64_t, TestFeatures>(Q);
35+
if (Q.get_device().has(sycl::aspect::fp64))
36+
Passed &= testACC<double, TestFeatures>(Q);
37+
#endif // __ESIMD_FORCE_STATELESS_MEM
38+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
39+
return Passed ? 0 : 1;
40+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//==------- gather_acc_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
10+
11+
// Use per-kernel compilation to have more information about failing cases.
12+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
13+
// RUN: %{run} %t.out
14+
15+
// The test verifies esimd::gather() functions accepting ACCESSOR
16+
// and optional compile-time esimd::properties.
17+
// The gather() calls in this test can use cache-hint properties
18+
// or VS > 1 (number of loads per offset).
19+
20+
#include "Inputs/gather.hpp"
21+
22+
int main() {
23+
auto Q = queue{gpu_selector_v};
24+
esimd_test::printTestLabel(Q);
25+
26+
constexpr auto TestFeatures = TestFeatures::DG2;
27+
bool Passed = true;
28+
29+
Passed &= testACC<int8_t, TestFeatures>(Q);
30+
Passed &= testACC<int16_t, TestFeatures>(Q);
31+
if (Q.get_device().has(sycl::aspect::fp16))
32+
Passed &= testACC<sycl::half, TestFeatures>(Q);
33+
Passed &= testACC<uint32_t, TestFeatures>(Q);
34+
Passed &= testACC<float, TestFeatures>(Q);
35+
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
36+
#ifdef __ESIMD_FORCE_STATELESS_MEM
37+
Passed &= testACC<int64_t, TestFeatures>(Q);
38+
if (Q.get_device().has(sycl::aspect::fp64))
39+
Passed &= testACC<double, TestFeatures>(Q);
40+
#endif // __ESIMD_FORCE_STATELESS_MEM
41+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
42+
return Passed ? 0 : 1;
43+
}

0 commit comments

Comments
 (0)