Skip to content

Commit 04edb74

Browse files
committed
[ESIMD] Implement gather(acc) accepting compile-time properties
Signed-off-by: Vyacheslav N Klochkov <[email protected]> Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent 7b62154 commit 04edb74

File tree

7 files changed

+903
-78
lines changed

7 files changed

+903
-78
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 427 additions & 35 deletions
Large diffs are not rendered by default.

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -803,23 +803,10 @@ __ESIMD_API
803803
return lsc_gather<T, NElts, DS, L1H, L3H>(
804804
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred);
805805
#else
806-
detail::check_lsc_vector_size<NElts>();
807-
detail::check_lsc_data_size<T, DS>();
808-
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
809-
constexpr uint16_t _AddressScale = 1;
810-
constexpr int _ImmOffset = 0;
811-
constexpr lsc_data_size _DS =
812-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
813-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
814-
constexpr detail::lsc_data_order _Transposed =
815-
detail::lsc_data_order::nontranspose;
816-
using MsgT = typename detail::lsc_expand_type<T>::type;
817-
auto si = __ESIMD_NS::get_surface_index(acc);
818-
__ESIMD_NS::simd<MsgT, N * NElts> Tmp =
819-
__esimd_lsc_load_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
820-
_Transposed, N>(pred.data(), offsets.data(), si);
821-
return detail::lsc_format_ret<T>(Tmp);
822-
#endif
806+
__ESIMD_NS::simd<T, N * NElts> PassThru; // Intentionally unitialized.
807+
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
808+
acc, offsets, pred, PassThru);
809+
#endif // __ESIMD_FORCE_STATELESS_MEM
823810
}
824811

825812
#ifdef __ESIMD_FORCE_STATELESS_MEM
@@ -891,25 +878,9 @@ __ESIMD_API
891878
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred, pass_thru);
892879

893880
#else
894-
detail::check_lsc_vector_size<NElts>();
895-
detail::check_lsc_data_size<T, DS>();
896-
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
897-
constexpr uint16_t _AddressScale = 1;
898-
constexpr int _ImmOffset = 0;
899-
constexpr lsc_data_size _DS =
900-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
901-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
902-
constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
903-
using MsgT = typename detail::lsc_expand_type<T>::type;
904-
auto SI = __ESIMD_NS::get_surface_index(acc);
905-
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
906-
detail::lsc_format_input<MsgT>(pass_thru);
907-
__ESIMD_NS::simd<MsgT, N * NElts> Result =
908-
__esimd_lsc_load_merge_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
909-
_VS, _Transposed, N>(
910-
pred.data(), offsets.data(), SI, PassThruExpanded.data());
911-
return detail::lsc_format_ret<T>(Result);
912-
#endif
881+
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
882+
acc, offsets, pred, pass_thru);
883+
#endif // __ESIMD_FORCE_STATELESS_MEM
913884
}
914885

915886
#ifdef __ESIMD_FORCE_STATELESS_MEM

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,188 @@ bool testUSM(queue Q, uint32_t MaskStride, PropertiesT) {
225225
return Passed;
226226
}
227227

228+
template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
229+
bool UseProperties, typename PropertiesT>
230+
bool testACC(queue Q, uint32_t MaskStride, PropertiesT) {
231+
232+
static_assert(VS > 0 && N % VS == 0,
233+
"Incorrect VS parameter. N must be divisible by VS.");
234+
constexpr int NOffsets = N / VS;
235+
static_assert(!UsePassThru || UseMask,
236+
"PassThru cannot be used without using mask");
237+
238+
uint32_t Groups = 8;
239+
uint32_t Threads = 16;
240+
241+
std::cout << "Running case: T=" << esimd_test::type_name<T>() << ", N=" << N
242+
<< ", VS=" << VS << ", MaskStride=" << MaskStride
243+
<< ", Groups=" << Groups << ", Threads=" << Threads
244+
<< ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
245+
<< ", use_properties=" << UseProperties << std::endl;
246+
247+
uint16_t Size = Groups * Threads * N;
248+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
249+
250+
sycl::range<1> GlobalRange{Groups};
251+
sycl::range<1> LocalRange{Threads};
252+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
253+
254+
T *Out = sycl::malloc_shared<T>(Size, Q);
255+
std::memset(Out, 0, Size * sizeof(T));
256+
257+
T *In = sycl::malloc_shared<T>(Size * 2, Q);
258+
for (int I = 0; I < Size; I++)
259+
In[I] = esimd_test::getRandomValue<T>();
260+
261+
try {
262+
buffer<T, 1> InBuf(In, Size * 2);
263+
Q.submit([&](handler &CGH) {
264+
accessor InAcc{InBuf, CGH};
265+
CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
266+
int GlobalID = NDI.get_global_id(0);
267+
PropertiesT Props{};
268+
269+
simd<OffsetT, NOffsets> ByteOffsets(GlobalID * N * sizeof(T),
270+
VS * sizeof(T));
271+
simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
272+
273+
simd_mask<NOffsets> Pred;
274+
for (int I = 0; I < NOffsets; I++)
275+
Pred[I] = (I % MaskStride == 0) ? 1 : 0;
276+
277+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
278+
simd<Tuint, N> PassThruInt(GlobalID * N, 1);
279+
simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
280+
auto PassThruView = PassThru.template select<N, 1>(0);
281+
282+
simd<T, N> Vals;
283+
if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
284+
if constexpr (UsePassThru) {
285+
if constexpr (UseProperties) {
286+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
287+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru,
288+
Props);
289+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
290+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView,
291+
Props);
292+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
293+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru,
294+
Props);
295+
else // ByteOffset - view, PassThru - view
296+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
297+
PassThruView, Props);
298+
} else { // UseProperties is false
299+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
300+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru);
301+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
302+
Vals =
303+
gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView);
304+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
305+
Vals =
306+
gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru);
307+
else // ByteOffset - view, PassThru - view
308+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
309+
PassThruView);
310+
}
311+
} else if constexpr (UseMask) { // UsePassThru is false
312+
if constexpr (UseProperties) {
313+
if (GlobalID % 2 == 0) // ByteOffset - simd
314+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, Props);
315+
else // ByteOffset - simd_view
316+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, Props);
317+
} else { // UseProperties is false
318+
if (GlobalID % 2 == 0) // ByteOffset - simd
319+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred);
320+
else // ByteOffset - simd_view
321+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred);
322+
}
323+
} else { // UseMask is false, UsePassThru is false
324+
if constexpr (UseProperties) {
325+
if (GlobalID % 2 == 0) // ByteOffset - simd
326+
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Props);
327+
else // ByteOffset - simd_view
328+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Props);
329+
} else { // UseProperties is false
330+
if (GlobalID % 2 == 0) // ByteOffset - simd
331+
Vals = gather<T, N, VS>(InAcc, ByteOffsets);
332+
else // ByteOffset - simd_view
333+
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView);
334+
}
335+
}
336+
} else {
337+
// if (VS == 1) then <T, N, VS> can often be omitted - test it here.
338+
// The variants accepting simd_view for 'PassThru' operand though
339+
// still require <T, N> to be specified explicitly to help
340+
// C++ FE do simd to simd_view matching.
341+
if constexpr (UsePassThru) {
342+
if constexpr (UseProperties) {
343+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
344+
Vals = gather<T>(InAcc, ByteOffsets, Pred, PassThru, Props);
345+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
346+
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView,
347+
Props);
348+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
349+
Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props);
350+
else // ByteOffset - view, PassThru - view
351+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView,
352+
Props);
353+
} else { // UseProperties is false
354+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
355+
Vals = gather(InAcc, ByteOffsets, Pred, PassThru);
356+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
357+
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView);
358+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
359+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThru);
360+
else // ByteOffset - view, PassThru - view
361+
Vals =
362+
gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView);
363+
}
364+
} else if constexpr (UseMask) { // UsePassThru is false
365+
if constexpr (UseProperties) {
366+
if (GlobalID % 2 == 0) // ByteOffset - simd
367+
Vals = gather<T>(InAcc, ByteOffsets, Pred, Props);
368+
else // ByteOffset - simd_view
369+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, Props);
370+
} else { // UseProperties is false
371+
if (GlobalID % 2 == 0) // ByteOffset - simd
372+
Vals = gather<T>(InAcc, ByteOffsets, Pred);
373+
else // ByteOffset - simd_view
374+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred);
375+
}
376+
} else { // UsePassThru is false, UseMask is false
377+
if constexpr (UseProperties) {
378+
if (GlobalID % 2 == 0) // ByteOffset - simd
379+
Vals = gather<T>(InAcc, ByteOffsets, Props);
380+
else // ByteOffset - simd_view
381+
Vals = gather<T, N>(InAcc, ByteOffsetsView, Props);
382+
} else {
383+
if (GlobalID % 2 == 0) // ByteOffset - simd
384+
Vals = gather<T>(InAcc, ByteOffsets);
385+
else // ByteOffset - simd_view
386+
Vals = gather<T, N>(InAcc, ByteOffsetsView);
387+
}
388+
}
389+
} // end if (VS == 1)
390+
Vals.copy_to(Out + GlobalID * N);
391+
// scatter(Out, ByteOffsets.template select<NOffsets, 1>(), Vals);
392+
});
393+
}).wait();
394+
} catch (sycl::exception const &e) {
395+
std::cout << "SYCL exception caught: " << e.what() << '\n';
396+
sycl::free(In, Q);
397+
sycl::free(Out, Q);
398+
return false;
399+
}
400+
401+
bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
402+
if (!Passed)
403+
std::cout << "Case FAILED" << std::endl;
404+
405+
sycl::free(In, Q);
406+
sycl::free(Out, Q);
407+
return Passed;
408+
}
409+
228410
template <typename T, TestFeatures Features> bool testUSM(queue Q) {
229411
constexpr bool UseMask = true;
230412
constexpr bool UsePassThru = true;
@@ -286,3 +468,61 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
286468
}
287469
return Passed;
288470
}
471+
472+
template <typename T, TestFeatures Features> bool testACC(queue Q) {
473+
constexpr bool UseMask = true;
474+
constexpr bool UsePassThru = true;
475+
constexpr bool UseProperties = true;
476+
477+
properties AlignElemProps{alignment<sizeof(T)>};
478+
479+
bool Passed = true;
480+
Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
481+
Q, 2, AlignElemProps);
482+
#ifdef __ESIMD_FORCE_STATELESS_MEM
483+
Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
484+
Q, 2, AlignElemProps);
485+
Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
486+
Q, 2, AlignElemProps);
487+
#endif // __ESIMD_FORCE_STATELESS_MEM
488+
Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
489+
Q, 3, AlignElemProps);
490+
Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(
491+
Q, 2, AlignElemProps);
492+
Passed &= testACC<T, 32, 1, UseMask, !UsePassThru, !UseProperties>(
493+
Q, 3, AlignElemProps);
494+
495+
if constexpr (Features == TestFeatures::PVC ||
496+
Features == TestFeatures::DG2) {
497+
properties LSCProps{cache_hint_L1<cache_hint::streaming>,
498+
cache_hint_L2<cache_hint::cached>,
499+
alignment<sizeof(T)>};
500+
Passed &=
501+
testACC<T, 1, 1, !UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
502+
Passed &=
503+
testACC<T, 2, 1, UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
504+
Passed &=
505+
testACC<T, 4, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);
506+
Passed &=
507+
testACC<T, 8, 1, UseMask, UsePassThru, UseProperties>(Q, 3, LSCProps);
508+
509+
Passed &=
510+
testACC<T, 32, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);
511+
512+
// Check VS > 1. GPU supports only dwords and qwords in this mode.
513+
if constexpr (sizeof(T) >= 4) {
514+
// TODO: This test case causes flaky fail. Enable it after the issue
515+
// in GPU driver is fixed.
516+
// Passed &= testACC<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
517+
// Q, 3, AlignElemProps);
518+
519+
Passed &= testACC<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
520+
Q, 3, AlignElemProps);
521+
Passed &= testACC<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
522+
Q, 3, AlignElemProps);
523+
Passed &= testACC<T, 32, 2, UseMask, UsePassThru, UseProperties>(
524+
Q, 3, AlignElemProps);
525+
}
526+
}
527+
return Passed;
528+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//==------- gather_acc.cpp - DPC++ ESIMD on-device test --------------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Use per-kernel compilation to have more information about failing cases.
9+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// The test verifies esimd::gather() functions accepting ACCESSOR
13+
// and optional compile-time esimd::properties.
14+
// The gather() calls in this test do not use cache-hint properties
15+
// or VS > 1 (number of loads per offset) to not impose using PVC features.
16+
17+
#include "Inputs/gather.hpp"
18+
19+
int main() {
20+
auto Q = queue{gpu_selector_v};
21+
esimd_test::printTestLabel(Q);
22+
23+
constexpr auto TestFeatures = TestFeatures::Generic;
24+
bool Passed = true;
25+
26+
Passed &= testACC<int8_t, TestFeatures>(Q);
27+
Passed &= testACC<int16_t, TestFeatures>(Q);
28+
if (Q.get_device().has(sycl::aspect::fp16))
29+
Passed &= testACC<sycl::half, TestFeatures>(Q);
30+
Passed &= testACC<uint32_t, TestFeatures>(Q);
31+
Passed &= testACC<float, TestFeatures>(Q);
32+
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
33+
#ifdef __ESIMD_FORCE_STATELESS_MEM
34+
Passed &= testACC<int64_t, TestFeatures>(Q);
35+
if (Q.get_device().has(sycl::aspect::fp64))
36+
Passed &= testACC<double, TestFeatures>(Q);
37+
#endif // __ESIMD_FORCE_STATELESS_MEM
38+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
39+
return Passed ? 0 : 1;
40+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//==------- gather_acc_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
10+
11+
// Use per-kernel compilation to have more information about failing cases.
12+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
13+
// RUN: %{run} %t.out
14+
15+
// The test verifies esimd::gather() functions accepting ACCESSOR
16+
// and optional compile-time esimd::properties.
17+
// The gather() calls in this test do not use cache-hint properties
18+
// or VS > 1 (number of loads per offset) to not impose using PVC features.
19+
20+
#include "Inputs/gather.hpp"
21+
22+
int main() {
23+
auto Q = queue{gpu_selector_v};
24+
esimd_test::printTestLabel(Q);
25+
26+
constexpr auto TestFeatures = TestFeatures::DG2;
27+
bool Passed = true;
28+
29+
Passed &= testACC<int8_t, TestFeatures>(Q);
30+
Passed &= testACC<int16_t, TestFeatures>(Q);
31+
if (Q.get_device().has(sycl::aspect::fp16))
32+
Passed &= testACC<sycl::half, TestFeatures>(Q);
33+
Passed &= testACC<uint32_t, TestFeatures>(Q);
34+
Passed &= testACC<float, TestFeatures>(Q);
35+
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
36+
#ifdef __ESIMD_FORCE_STATELESS_MEM
37+
Passed &= testACC<int64_t, TestFeatures>(Q);
38+
if (Q.get_device().has(sycl::aspect::fp64))
39+
Passed &= testACC<double, TestFeatures>(Q);
40+
#endif // __ESIMD_FORCE_STATELESS_MEM
41+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
42+
return Passed ? 0 : 1;
43+
}

0 commit comments

Comments
 (0)