Skip to content

Commit 5582ce4

Browse files
authored
[SYCL][ESIMD] Implement slm_gather accepting compile-time properties (#12456)
1 parent f4b4a84 commit 5582ce4

File tree

8 files changed

+806
-53
lines changed

8 files changed

+806
-53
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 375 additions & 18 deletions
Large diffs are not rendered by default.

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -556,20 +556,8 @@ template <typename T, int NElts = 1,
556556
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
557557
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
558558
__ESIMD_NS::simd_mask<N> pred = 1) {
559-
detail::check_lsc_vector_size<NElts>();
560-
detail::check_lsc_data_size<T, DS>();
561-
constexpr uint16_t _AddressScale = 1;
562-
constexpr int _ImmOffset = 0;
563-
constexpr lsc_data_size _DS =
564-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
565-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
566-
constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
567-
using MsgT = typename detail::lsc_expand_type<T>::type;
568-
__ESIMD_NS::simd<MsgT, N * NElts> Tmp =
569-
__esimd_lsc_load_slm<MsgT, cache_hint::none, cache_hint::none,
570-
_AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
571-
pred.data(), offsets.data());
572-
return detail::lsc_format_ret<T>(Tmp);
559+
__ESIMD_NS::simd<T, N * NElts> pass_thru;
560+
return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
573561
}
574562

575563
/// SLM gather.
@@ -595,24 +583,7 @@ __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
595583
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
596584
__ESIMD_NS::simd_mask<N> pred,
597585
__ESIMD_NS::simd<T, N * NElts> pass_thru) {
598-
detail::check_lsc_vector_size<NElts>();
599-
detail::check_lsc_data_size<T, DS>();
600-
constexpr uint16_t _AddressScale = 1;
601-
constexpr int _ImmOffset = 0;
602-
constexpr lsc_data_size _DS =
603-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
604-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
605-
constexpr detail::lsc_data_order _Transposed =
606-
detail::lsc_data_order::nontranspose;
607-
using MsgT = typename detail::lsc_expand_type<T>::type;
608-
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
609-
detail::lsc_format_input<MsgT>(pass_thru);
610-
__ESIMD_NS::simd<MsgT, N * NElts> Result =
611-
__esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
612-
_AddressScale, _ImmOffset, _DS, _VS,
613-
_Transposed, N>(pred.data(), offsets.data(),
614-
PassThruExpanded.data());
615-
return detail::lsc_format_ret<T>(Result);
586+
return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
616587
}
617588

618589
/// Transposed SLM gather with 1 channel.

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp

Lines changed: 238 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,244 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
468468
return Passed;
469469
}
470470

471+
template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
472+
bool UseProperties, typename PropertiesT>
473+
bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) {
474+
475+
static_assert(VS > 0 && N % VS == 0,
476+
"Incorrect VS parameter. N must be divisible by VS.");
477+
constexpr int NOffsets = N / VS;
478+
static_assert(!UsePassThru || UseMask,
479+
"PassThru cannot be used without using mask");
480+
481+
constexpr uint32_t Groups = 8;
482+
constexpr uint32_t Threads = 16;
483+
484+
std::cout << "Running slm_gather case: T=" << esimd_test::type_name<T>()
485+
<< ", N=" << N << ", VS=" << VS << ", MaskStride=" << MaskStride
486+
<< ", Groups=" << Groups << ", Threads=" << Threads
487+
<< ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
488+
<< ", use_properties=" << UseProperties << std::endl;
489+
490+
constexpr uint16_t Size = Groups * Threads * N;
491+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
492+
493+
sycl::range<1> GlobalRange{Groups};
494+
sycl::range<1> LocalRange{Threads};
495+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
496+
497+
T *Out = sycl::malloc_shared<T>(Size, Q);
498+
std::memset(Out, 0, Size * sizeof(T));
499+
500+
T *In = sycl::malloc_shared<T>(Size * 2, Q);
501+
for (int I = 0; I < Size; I++)
502+
In[I] = esimd_test::getRandomValue<T>();
503+
504+
try {
505+
Q.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
506+
uint16_t GlobalID = NDI.get_global_id(0);
507+
uint16_t LocalID = NDI.get_local_id(0);
508+
uint32_t GlobalElemOffset = GlobalID * N;
509+
uint32_t LocalElemOffset = LocalID * N;
510+
511+
// Allocate a bit more to safely initialize it with 4-element chunks.
512+
constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
513+
slm_init<SLMSize>();
514+
515+
if (LocalID == 0) {
516+
for (int I = 0; I < Threads * N; I += 8) {
517+
simd<T, 8> InVec(In + GlobalElemOffset + I);
518+
simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
519+
slm_scatter<T>(offsets, InVec);
520+
}
521+
}
522+
barrier();
523+
524+
PropertiesT Props{};
525+
526+
simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
527+
VS * sizeof(T));
528+
simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
529+
530+
simd_mask<NOffsets> Pred;
531+
for (int I = 0; I < NOffsets; I++)
532+
Pred[I] = (I % MaskStride == 0) ? 1 : 0;
533+
534+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
535+
simd<Tuint, N> PassThruInt(GlobalElemOffset, 1);
536+
simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
537+
auto PassThruView = PassThru.template select<N, 1>(0);
538+
539+
simd<T, N> Vals;
540+
if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
541+
if constexpr (UsePassThru) {
542+
if constexpr (UseProperties) {
543+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
544+
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru, Props);
545+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
546+
Vals =
547+
slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView, Props);
548+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
549+
Vals =
550+
slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru, Props);
551+
else // ByteOffset - view, PassThru - view
552+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView,
553+
Props);
554+
} else { // UseProperties is false
555+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
556+
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru);
557+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
558+
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView);
559+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
560+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru);
561+
else // ByteOffset - view, PassThru - view
562+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView);
563+
}
564+
} else if constexpr (UseMask) { // UsePassThru is false
565+
if constexpr (UseProperties) {
566+
if (GlobalID % 2 == 0) // ByteOffset - simd
567+
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, Props);
568+
else // ByteOffset - simd_view
569+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, Props);
570+
} else { // UseProperties is false
571+
if (GlobalID % 2 == 0) // ByteOffset - simd
572+
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred);
573+
else // ByteOffset - simd_view
574+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred);
575+
}
576+
} else { // UseMask is false, UsePassThru is false
577+
if constexpr (UseProperties) {
578+
if (GlobalID % 2 == 0) // ByteOffset - simd
579+
Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
580+
else // ByteOffset - simd_view
581+
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Props);
582+
} else { // UseProperties is false
583+
if (GlobalID % 2 == 0) // ByteOffset - simd
584+
Vals = slm_gather<T, N, VS>(ByteOffsets);
585+
else // ByteOffset - simd_view
586+
Vals = slm_gather<T, N, VS>(ByteOffsetsView);
587+
}
588+
}
589+
} else {
590+
// if (VS == 1) then <T, N, VS> can often be omitted - test it here.
591+
// The variants accepting simd_view for 'PassThru' operand though
592+
// still require <T, N> to be specified explicitly to help
593+
// C++ FE do simd to simd_view matching.
594+
if constexpr (UsePassThru) {
595+
if constexpr (UseProperties) {
596+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
597+
Vals = slm_gather<T>(ByteOffsets, Pred, PassThru, Props);
598+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
599+
Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView, Props);
600+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
601+
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru, Props);
602+
else // ByteOffset - view, PassThru - view
603+
Vals =
604+
slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView, Props);
605+
} else { // UseProperties is false
606+
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
607+
Vals = slm_gather<T>(ByteOffsets, Pred, PassThru);
608+
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
609+
Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView);
610+
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
611+
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru);
612+
else // ByteOffset - view, PassThru - view
613+
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView);
614+
}
615+
} else if constexpr (UseMask) { // UsePassThru is false
616+
if constexpr (UseProperties) {
617+
if (GlobalID % 2 == 0) // ByteOffset - simd
618+
Vals = slm_gather<T>(ByteOffsets, Pred, Props);
619+
else // ByteOffset - simd_view
620+
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, Props);
621+
} else { // UseProperties is false
622+
if (GlobalID % 2 == 0) // ByteOffset - simd
623+
Vals = slm_gather<T>(ByteOffsets, Pred);
624+
else // ByteOffset - simd_view
625+
Vals = slm_gather<T, N>(ByteOffsetsView, Pred);
626+
}
627+
} else { // UsePassThru is false, UseMask is false
628+
if constexpr (UseProperties) {
629+
if (GlobalID % 2 == 0) // ByteOffset - simd
630+
Vals = slm_gather<T>(ByteOffsets, Props);
631+
else // ByteOffset - simd_view
632+
Vals = slm_gather<T, N>(ByteOffsetsView, Props);
633+
} else {
634+
if (GlobalID % 2 == 0) // ByteOffset - simd
635+
Vals = slm_gather<T>(ByteOffsets);
636+
else // ByteOffset - simd_view
637+
Vals = slm_gather<T, N>(ByteOffsetsView);
638+
}
639+
}
640+
} // end if (VS == 1)
641+
Vals.copy_to(Out + GlobalElemOffset);
642+
}).wait();
643+
} catch (sycl::exception const &e) {
644+
std::cout << "SYCL exception caught: " << e.what() << '\n';
645+
sycl::free(In, Q);
646+
sycl::free(Out, Q);
647+
return false;
648+
}
649+
650+
bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
651+
if (!Passed)
652+
std::cout << "Case FAILED" << std::endl;
653+
654+
sycl::free(In, Q);
655+
sycl::free(Out, Q);
656+
return Passed;
657+
}
658+
659+
template <typename T, TestFeatures Features> bool testSLM(queue Q) {
660+
constexpr bool UseMask = true;
661+
constexpr bool UsePassThru = true;
662+
constexpr bool UseProperties = true;
663+
664+
properties AlignElemProps{alignment<sizeof(T)>};
665+
666+
bool Passed = true;
667+
Passed &= testSLM<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
668+
Q, 2, AlignElemProps);
669+
Passed &= testSLM<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
670+
Q, 2, AlignElemProps);
671+
Passed &= testSLM<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
672+
Q, 2, AlignElemProps);
673+
Passed &= testSLM<T, 8, 1, UseMask, !UsePassThru, UseProperties>(
674+
Q, 3, AlignElemProps);
675+
// UsePassThru requires either DG2/PVC or support of llvm.masked.gather LLVM
676+
// IR.
677+
#ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
678+
Passed &= testSLM<T, 16, 1, UseMask, UsePassThru, UseProperties>(
679+
Q, 2, AlignElemProps);
680+
Passed &= testSLM<T, 32, 1, UseMask, UsePassThru, !UseProperties>(
681+
Q, 3, AlignElemProps);
682+
#endif
683+
684+
// TODO: test non-power-of-2 N
685+
// Such cases were promised to be supported, but in fact they fail.
686+
// Create some test cases here after the issue in GPU driver is resolved.
687+
688+
if constexpr (Features == TestFeatures::PVC ||
689+
Features == TestFeatures::DG2) {
690+
691+
// Check VS > 1. GPU supports only dwords and qwords in this mode.
692+
if constexpr (sizeof(T) >= 4) {
693+
// TODO: This test case causes flaky fail. Enable it after the issue
694+
// in GPU driver is fixed.
695+
// Passed &= testUSM<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
696+
// Q, 3, AlignElemProps);
697+
698+
Passed &= testSLM<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
699+
Q, 3, AlignElemProps);
700+
Passed &= testSLM<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
701+
Q, 3, AlignElemProps);
702+
Passed &= testSLM<T, 32, 2, UseMask, UsePassThru, UseProperties>(
703+
Q, 3, AlignElemProps);
704+
}
705+
}
706+
return Passed;
707+
}
708+
471709
template <typename T, TestFeatures Features> bool testACC(queue Q) {
472710
constexpr bool UseMask = true;
473711
constexpr bool UsePassThru = true;
@@ -478,12 +716,10 @@ template <typename T, TestFeatures Features> bool testACC(queue Q) {
478716
bool Passed = true;
479717
Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
480718
Q, 2, AlignElemProps);
481-
#ifdef __ESIMD_FORCE_STATELESS_MEM
482719
Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
483720
Q, 2, AlignElemProps);
484721
Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
485722
Q, 2, AlignElemProps);
486-
#endif // __ESIMD_FORCE_STATELESS_MEM
487723
Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
488724
Q, 3, AlignElemProps);
489725
Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
//==------- slm_gather.cpp - DPC++ ESIMD on-device test --------------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Use per-kernel compilation to have more information about failing cases.
9+
// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// The test verifies esimd::slm_gather() functions accepting optional
13+
// compile-time esimd::properties. The slm_gather() calls in this test do not
14+
// use VS > 1 (number of loads per offset) to not impose using DG2/PVC features.
15+
16+
#include "Inputs/gather.hpp"
17+
18+
int main() {
19+
auto Q = queue{gpu_selector_v};
20+
esimd_test::printTestLabel(Q);
21+
22+
constexpr auto TestFeatures = TestFeatures::Generic;
23+
bool Passed = true;
24+
25+
Passed &= testSLM<int8_t, TestFeatures>(Q);
26+
Passed &= testSLM<int16_t, TestFeatures>(Q);
27+
if (Q.get_device().has(sycl::aspect::fp16))
28+
Passed &= testSLM<sycl::half, TestFeatures>(Q);
29+
Passed &= testSLM<uint32_t, TestFeatures>(Q);
30+
Passed &= testSLM<float, TestFeatures>(Q);
31+
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
32+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
33+
return Passed ? 0 : 1;
34+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
//==------- slm_gather_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
9+
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// The test verifies esimd::slm_gather() functions accepting optional
13+
// compile-time esimd::properties. The slm_gather() calls in this test use
14+
// VS > 1 (number of loads per offset) and require DG2 or PVC to run.
15+
16+
#include "Inputs/gather.hpp"
17+
18+
int main() {
19+
auto Q = queue{gpu_selector_v};
20+
esimd_test::printTestLabel(Q);
21+
22+
// DG2 and PVC support same gather() configurations. If some gather call
23+
// has corresponding instructions in PVC and does not have it in DG2, then
24+
// GPU RT emulates it for DG2.
25+
constexpr auto TestFeatures = TestFeatures::DG2;
26+
bool Passed = true;
27+
28+
Passed &= testSLM<int8_t, TestFeatures>(Q);
29+
Passed &= testSLM<int16_t, TestFeatures>(Q);
30+
if (Q.get_device().has(sycl::aspect::fp16))
31+
Passed &= testSLM<sycl::half, TestFeatures>(Q);
32+
Passed &= testSLM<uint32_t, TestFeatures>(Q);
33+
Passed &= testSLM<float, TestFeatures>(Q);
34+
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
35+
36+
std::cout << (Passed ? "Passed\n" : "FAILED\n");
37+
return Passed ? 0 : 1;
38+
}

0 commit comments

Comments
 (0)