Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit d61a45f

Browse files
authored
[ESIMD] Add tests for lsc_gather() lsc_slm_gather() with merging sem… (#1632)
[ESIMD] Add tests for lsc_gather() lsc_slm_gather() with merging semantics Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 6305c41 commit d61a45f

16 files changed

+367
-280
lines changed

SYCL/ESIMD/esimd_test_utils.hpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#pragma once
1010

11-
#include <sycl/ext/intel/experimental/esimd/tfloat32.hpp>
11+
#include <sycl/ext/intel/esimd.hpp>
1212
#include <sycl/sycl.hpp>
1313
#define NOMINMAX
1414

@@ -277,7 +277,7 @@ void display_timing_stats(double const *kernelTime,
277277
std::cout << "[OverallTime][Primary]: " << overallTime << "\n";
278278
}
279279

280-
// Get signed integer of given byte size.
280+
// Get signed integer of given byte size or 'void'.
281281
template <int N>
282282
using int_type_t = std::conditional_t<
283283
N == 1, int8_t,
@@ -286,6 +286,15 @@ using int_type_t = std::conditional_t<
286286
std::conditional_t<N == 4, int32_t,
287287
std::conditional_t<N == 8, int64_t, void>>>>;
288288

289+
// Get unsigned integer type of given byte size or 'void'.
290+
template <int N>
291+
using uint_type_t = std::conditional_t<
292+
N == 1, uint8_t,
293+
std::conditional_t<
294+
N == 2, uint16_t,
295+
std::conditional_t<N == 4, uint32_t,
296+
std::conditional_t<N == 8, uint64_t, void>>>>;
297+
289298
enum class BinaryOp {
290299
add,
291300
sub,
@@ -578,4 +587,27 @@ TID(sycl::ext::intel::experimental::esimd::tfloat32)
578587
TID(float)
579588
TID(double)
580589

590+
std::string toString(sycl::ext::intel::experimental::esimd::lsc_data_size DS) {
591+
switch (DS) {
592+
case sycl::ext::intel::experimental::esimd::lsc_data_size::default_size:
593+
return "lsc_data_size::default";
594+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u8:
595+
return "lsc_data_size::u8";
596+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16:
597+
return "lsc_data_size::u16";
598+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u32:
599+
return "lsc_data_size::u32";
600+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u64:
601+
return "lsc_data_size::u64";
602+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u8u32:
603+
return "lsc_data_size::u8u32";
604+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16u32:
605+
return "lsc_data_size::u16u32";
606+
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16u32h:
607+
return "lsc_data_size::u16u32h";
608+
}
609+
assert(false && "Unknown lsc_data_size");
610+
return "INVALID lsc_data_size";
611+
}
612+
581613
} // namespace esimd_test

SYCL/ESIMD/lsc/Inputs/lsc_slm_load.hpp

Lines changed: 78 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,18 @@
33

44
#include <iostream>
55

6+
#include "../../esimd_test_utils.hpp"
7+
68
using namespace sycl;
79
using namespace sycl::ext::intel::esimd;
810
using namespace sycl::ext::intel::experimental::esimd;
911

10-
// TODO: The SPEC does not say what values are returned for lsc_slm_gather
11-
// when the corresponding elements of the predicate/mask is zero.
12-
// It is assumed to be undefined values there.
13-
// Thus this test does not check those elements now. From the API point of view
14-
// it may be better to have another argument for the values being copied to
15-
// the result when the mask bit is 0.
16-
17-
template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
18-
uint16_t VL, uint16_t NChannels, bool Transpose,
12+
template <typename T, uint32_t Groups, uint32_t LocalRange, uint16_t VL,
13+
uint16_t NChannels, bool Transpose, bool TestMergeOperand,
1914
lsc_data_size DS = lsc_data_size::default_size>
20-
bool test(uint32_t PMask = ~0) {
15+
bool test(queue Q, uint32_t PMask = ~0) {
16+
using Tuint = esimd_test::uint_type_t<sizeof(T)>;
17+
2118
static_assert((NChannels == 1) || !Transpose,
2219
"Transpose must have exec size 1");
2320
if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
@@ -35,101 +32,111 @@ bool test(uint32_t PMask = ~0) {
3532
"vector size is greater than 1");
3633
}
3734

38-
T VMask = static_cast<T>(-1);
35+
std::cout << "Running test: T=" << esimd_test::type_name<T>() << ", VL=" << VL
36+
<< ", NChannels=" << NChannels
37+
<< ", DS=" << esimd_test::toString(DS)
38+
<< ", Transpose=" << Transpose
39+
<< ", TestMergeOperand=" << TestMergeOperand
40+
<< ", Groups=" << Groups << ", LocalRange=" << LocalRange
41+
<< std::endl;
42+
43+
Tuint VMask = static_cast<Tuint>(-1);
3944
if constexpr (DS == lsc_data_size::u8u32)
40-
VMask = static_cast<T>(0xff);
45+
VMask = 0xff;
4146
else if constexpr (DS == lsc_data_size::u16u32)
42-
VMask = static_cast<T>(0xffff);
47+
VMask = 0xffff;
4348
else if constexpr (DS == lsc_data_size::u16u32h)
44-
VMask = static_cast<T>(0xffff0000);
45-
46-
queue Q(gpu_selector_v);
47-
auto D = Q.get_device();
48-
std::cout << "Running case #" << CaseNum << " on "
49-
<< D.get_info<sycl::info::device::name>() << std::endl;
49+
VMask = 0xffff0000;
5050

5151
nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
52-
constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
52+
uint32_t OutSize = Groups * LocalRange * VL * NChannels;
5353
T *Out = malloc_shared<T>(OutSize, Q);
5454
memset(Out, 0, OutSize * sizeof(T));
55+
T MergeValue = 2;
5556

5657
try {
57-
Q.submit([&](handler &cgh) {
58-
cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
59-
uint32_t GID = NDId.get_global_id(0);
60-
uint32_t LID = NDId.get_local_id(0);
61-
uint32_t GroupID = NDId.get_group_linear_id();
62-
63-
// Allocate and init 128-byte multiple size SLM memory with
64-
// consequential values. i-th group gets values:
65-
// {0, 1, 2, ...} + GroupID * 1000000.
66-
constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
67-
constexpr uint32_t SLMSize =
68-
(ResultSIMDByteSize * LocalRange + 127) & ~127;
69-
slm_init(SLMSize);
70-
if (NDId.get_local_id(0) == 0) {
71-
simd<T, 4> Vals(GroupID * 1000000, 1);
72-
for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
73-
slm_block_store<T, 4>(I, Vals);
74-
Vals += 4;
75-
}
58+
Q.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
59+
uint32_t GID = NDId.get_global_id(0);
60+
uint32_t LID = NDId.get_local_id(0);
61+
uint32_t GroupID = NDId.get_group_linear_id();
62+
63+
// Allocate and init 128-byte multiple size SLM memory with
64+
// consequential values. i-th group gets values:
65+
// {0, 1, 2, ...} + GroupID * 1000000.
66+
constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
67+
constexpr uint32_t SLMSize =
68+
(ResultSIMDByteSize * LocalRange + 127) & ~127;
69+
slm_init<SLMSize>();
70+
if (NDId.get_local_id(0) == 0) {
71+
simd<Tuint, 4> Vals(GroupID * 1000000, 1);
72+
for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
73+
slm_block_store<Tuint, 4>(I, Vals);
74+
Vals += 4;
7675
}
77-
barrier();
78-
79-
if constexpr (Transpose) {
80-
auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
81-
Vals.copy_to(Out + GID * VL);
76+
}
77+
barrier();
78+
79+
if constexpr (Transpose) {
80+
auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
81+
Vals.copy_to(Out + GID * VL);
82+
} else {
83+
simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
84+
NChannels * sizeof(T));
85+
86+
// Create the predicate for the gather from 'PMask'.
87+
simd_mask<VL> Pred;
88+
for (int I = 0; I < VL; I++)
89+
Pred.template select<1, 1>(I) = (PMask >> I) & 1;
90+
91+
simd<T, VL * NChannels> Vals;
92+
if constexpr (TestMergeOperand) {
93+
simd<T, VL *NChannels> OldVals = MergeValue;
94+
Vals = lsc_slm_gather<T, NChannels, DS>(Offsets, Pred, OldVals);
8295
} else {
83-
simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
84-
NChannels * sizeof(T));
85-
86-
// Create the predicate for the gather from 'PMask'.
87-
simd_mask<VL> Pred;
88-
for (int I = 0; I < VL; I++)
89-
Pred.template select<1, 1>(I) = (PMask >> I) & 1;
90-
91-
simd<T, VL *NChannels> Vals =
92-
lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);
93-
94-
Vals.copy_to(Out + GID * VL * NChannels);
96+
Vals = lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);
9597
}
96-
});
98+
99+
Vals.copy_to(Out + GID * VL * NChannels);
100+
}
97101
}).wait();
98102
} catch (sycl::exception const &e) {
99103
std::cout << "SYCL exception caught: " << e.what() << '\n';
100104
sycl::free(Out, Q);
101105
return false;
102106
}
103107

104-
bool Passed = true;
108+
int NErrors = 0;
105109

106110
if constexpr (Transpose) {
107111
for (uint32_t I = 0; I < OutSize; I++) {
108112
uint32_t GroupId = I / (LocalRange * VL * NChannels);
109113
uint32_t LID = I % (LocalRange * VL * NChannels);
110114
T ExpectedVal = GroupId * 1000000 + LID;
111-
if (Out[I] != ExpectedVal) {
112-
Passed = false;
113-
std::cout << I << ": Value = " << Out[I]
115+
if (Out[I] != ExpectedVal && NErrors++ < 32) {
116+
std::cout << "Error: " << I << ": Value = " << Out[I]
114117
<< ", Expected value = " << ExpectedVal << std::endl;
115118
}
116119
}
117120
} else {
118121
for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
119122
uint32_t GroupId = I / (LocalRange * VL * NChannels);
120123
uint32_t LID = I % (LocalRange * VL * NChannels);
121-
T ExpectedValBase = GroupId * 1000000 + LID;
124+
Tuint ExpectedValBase = GroupId * 1000000 + LID;
122125
for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
123126
for (int J = 0; J < VL; J++) {
124127
uint32_t OutIndex = I + ChannelId * VL + J;
125128

126-
if (((PMask >> J) & 1) == 0)
129+
bool IsMaskSet = (PMask >> J) & 1;
130+
if (!TestMergeOperand && !IsMaskSet)
127131
continue;
128-
T ExpectedVal = (ExpectedValBase + ChannelId + J * NChannels) & VMask;
129-
if (Out[OutIndex] != ExpectedVal) {
130-
Passed = false;
131-
std::cout << OutIndex << ": Value = " << Out[OutIndex]
132-
<< ", Expected value = " << ExpectedVal << std::endl;
132+
Tuint ExpectedVal =
133+
IsMaskSet ? (ExpectedValBase + ChannelId + J * NChannels) & VMask
134+
: sycl::bit_cast<Tuint>(MergeValue);
135+
Tuint ComputedVal = sycl::bit_cast<Tuint>(Out[OutIndex]);
136+
if (ComputedVal != ExpectedVal && NErrors++ < 32) {
137+
std::cout << "Error: " << OutIndex << ": Value = " << ComputedVal
138+
<< ", Expected value = " << ExpectedVal
139+
<< ", Mask = " << IsMaskSet << std::endl;
133140
}
134141
}
135142
}
@@ -138,7 +145,7 @@ bool test(uint32_t PMask = ~0) {
138145

139146
sycl::free(Out, Q);
140147

141-
if (!Passed)
142-
std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
143-
return Passed;
148+
if (NErrors)
149+
std::cout << " FAILED" << std::endl;
150+
return NErrors == 0;
144151
}

SYCL/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,15 @@ template <typename T, uint16_t N,
2222
lsc_data_size DS = lsc_data_size::default_size,
2323
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2424
bool UsePrefetch = false, bool UseOldValuesOperand = true>
25-
bool test(uint32_t Groups, uint32_t Threads) {
25+
bool test(queue Q, uint32_t Groups, uint32_t Threads) {
2626
static_assert(DS != lsc_data_size::u8u32 && DS != lsc_data_size::u16u32,
2727
"unsupported DS for lsc_block_load()");
2828
static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
2929

3030
uint32_t Size = Groups * Threads * N;
3131
using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
3232

33-
auto Q = queue{gpu_selector_v};
34-
auto D = Q.get_device();
35-
std::cout << "Running on " << D.get_info<sycl::info::device::name>()
36-
<< ", T=" << esimd_test::type_name<T>() << ",N=" << N
33+
std::cout << "Running case: T=" << esimd_test::type_name<T>() << ",N=" << N
3734
<< ",UsePrefetch=" << UsePrefetch
3835
<< ",UseOldValuesOperand=" << UseOldValuesOperand;
3936

@@ -121,46 +118,57 @@ template <typename T> bool test_lsc_block_load() {
121118
constexpr bool CheckMerge = true;
122119
constexpr bool NoCheckMerge = false;
123120

121+
auto Q = queue{gpu_selector_v};
122+
std::cout << "Running lsc_block_load() tests for T="
123+
<< esimd_test::type_name<T>() << " on "
124+
<< Q.get_device().get_info<sycl::info::device::name>() << std::endl;
125+
124126
bool Passed = true;
125-
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
126-
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
127-
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 2);
128-
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 8);
129-
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 3);
127+
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
128+
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
129+
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 2, 2);
130+
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 2, 8);
131+
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 3, 3);
130132
if constexpr (sizeof(T) * 2 >= sizeof(int))
131-
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(5, 5);
133+
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 5, 5);
132134
if constexpr (sizeof(T) >= sizeof(int))
133-
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 5);
135+
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 3, 5);
134136

135-
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(1, 4);
136-
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 2);
137-
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, CheckMerge>(4, 4);
138-
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 8);
139-
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 3);
137+
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 1, 4);
138+
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 2, 2);
139+
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 4, 4);
140+
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 2, 8);
141+
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 3);
140142
if constexpr (sizeof(T) * 2 >= sizeof(int))
141-
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(5, 5);
143+
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 5, 5);
142144
if constexpr (sizeof(T) >= sizeof(int))
143-
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 5);
145+
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 5);
144146

145147
return Passed;
146148
}
147149

148-
template <typename T, lsc_data_size DS = lsc_data_size::default_size>
149-
bool test_lsc_prefetch() {
150+
template <typename T, lsc_data_size DS = lsc_data_size::default_size,
151+
bool IsGatherLikePrefetch = false>
152+
std::enable_if_t<!IsGatherLikePrefetch, bool> test_lsc_prefetch() {
150153
constexpr cache_hint L1H = cache_hint::cached;
151154
constexpr cache_hint L3H = cache_hint::uncached;
152155
constexpr bool DoPrefetch = true;
153156

157+
auto Q = queue{gpu_selector_v};
158+
std::cout << "Running block-load-like lsc_prefetch() tests for T="
159+
<< esimd_test::type_name<T>() << " on "
160+
<< Q.get_device().get_info<sycl::info::device::name>() << std::endl;
161+
154162
bool Passed = true;
155-
Passed &= test<T, 64, DS, L1H, L3H, DoPrefetch>(1, 4);
156-
Passed &= test<T, 32, DS, L1H, L3H, DoPrefetch>(1, 4);
157-
Passed &= test<T, 16, DS, L1H, L3H, DoPrefetch>(2, 2);
158-
Passed &= test<T, 8, DS, L1H, L3H, DoPrefetch>(2, 8);
159-
Passed &= test<T, 4, DS, L1H, L3H, DoPrefetch>(3, 3);
163+
Passed &= test<T, 64, DS, L1H, L3H, DoPrefetch>(Q, 1, 4);
164+
Passed &= test<T, 32, DS, L1H, L3H, DoPrefetch>(Q, 1, 4);
165+
Passed &= test<T, 16, DS, L1H, L3H, DoPrefetch>(Q, 2, 2);
166+
Passed &= test<T, 8, DS, L1H, L3H, DoPrefetch>(Q, 2, 8);
167+
Passed &= test<T, 4, DS, L1H, L3H, DoPrefetch>(Q, 3, 3);
160168
if constexpr (sizeof(T) * 2 >= sizeof(int))
161-
Passed &= test<T, 2, DS, L1H, L3H, DoPrefetch>(5, 5);
169+
Passed &= test<T, 2, DS, L1H, L3H, DoPrefetch>(Q, 5, 5);
162170
if constexpr (sizeof(T) >= sizeof(int))
163-
Passed &= test<T, 1, DS, L1H, L3H, DoPrefetch>(3, 5);
171+
Passed &= test<T, 1, DS, L1H, L3H, DoPrefetch>(Q, 3, 5);
164172

165173
return Passed;
166174
}

0 commit comments

Comments
 (0)