Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit a22c91e

Browse files
authored
[ESIMD] Test merging lsc_block_load(); separate block_load and gather (#1588)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 9eb740b commit a22c91e

33 files changed

+687
-493
lines changed

SYCL/ESIMD/lsc/Inputs/common.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#pragma once
10+
911
#include <stdlib.h>
1012
#include <sycl/bit_cast.hpp>
1113

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
//==-- lsc_usm_block_load_prefetch.hpp - DPC++ ESIMD on-device test --------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <sycl/ext/intel/esimd.hpp>
10+
#include <sycl/sycl.hpp>
11+
12+
#include <iostream>
13+
14+
#include "../../esimd_test_utils.hpp"
15+
#include "common.hpp"
16+
17+
using namespace sycl;
18+
using namespace sycl::ext::intel::esimd;
19+
using namespace sycl::ext::intel::experimental::esimd;
20+
21+
template <typename T, uint16_t N,
22+
lsc_data_size DS = lsc_data_size::default_size,
23+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
24+
bool UsePrefetch = false, bool UseOldValuesOperand = true>
25+
bool test(uint32_t Groups, uint32_t Threads) {
26+
static_assert(DS != lsc_data_size::u8u32 && DS != lsc_data_size::u16u32,
27+
"unsupported DS for lsc_block_load()");
28+
static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
29+
30+
uint32_t Size = Groups * Threads * N;
31+
using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
32+
33+
auto Q = queue{gpu_selector_v};
34+
auto D = Q.get_device();
35+
std::cout << "Running on " << D.get_info<sycl::info::device::name>()
36+
<< ", T=" << esimd_test::type_name<T>() << ",N=" << N
37+
<< ",UsePrefetch=" << UsePrefetch
38+
<< ",UseOldValuesOperand=" << UseOldValuesOperand;
39+
40+
sycl::range<1> GlobalRange{Groups};
41+
sycl::range<1> LocalRange{Threads};
42+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
43+
44+
T *Out = sycl::malloc_shared<T>(Size, Q);
45+
T *In = sycl::malloc_shared<T>(Size, Q);
46+
for (int i = 0; i < Size; i++) {
47+
In[i] = get_rand<T>();
48+
Out[i] = 0;
49+
}
50+
51+
try {
52+
Q.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
53+
uint16_t GlobalID = NDI.get_global_id(0);
54+
uint32_t ElemOffset = GlobalID * N;
55+
56+
simd<T, N> Vals;
57+
if constexpr (UseOldValuesOperand) {
58+
// TODO: these 2 lines work-around the problem with scalar conversions
59+
// to bfloat16. It could be just: "simd<T, N> OldValues(ElemOffset,
60+
// 1);"
61+
simd<uint32_t, N> OldValuesInt(ElemOffset, 1);
62+
simd<T, N> OldValues = OldValuesInt;
63+
64+
simd_mask<1> Mask = GlobalID % 1;
65+
if constexpr (UsePrefetch) {
66+
lsc_prefetch<T, N, DS, L1H, L3H>(In + ElemOffset);
67+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset, Mask, OldValues);
68+
} else {
69+
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset, Mask,
70+
OldValues);
71+
}
72+
} else {
73+
if constexpr (UsePrefetch) {
74+
lsc_prefetch<T, N, DS, L1H, L3H>(In + ElemOffset);
75+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset);
76+
} else {
77+
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset);
78+
}
79+
}
80+
lsc_block_store(Out + ElemOffset, Vals);
81+
}).wait();
82+
} catch (sycl::exception const &e) {
83+
std::cout << "SYCL exception caught: " << e.what() << '\n';
84+
sycl::free(Out, Q);
85+
sycl::free(In, Q);
86+
return false;
87+
}
88+
89+
int NumErrors = 0;
90+
for (int i = 0; i < Size && NumErrors < 32; i++) {
91+
bool IsMaskSet = (i / N) % 1;
92+
Tuint Expected = sycl::bit_cast<Tuint>(In[i]);
93+
Tuint Computed = sycl::bit_cast<Tuint>(Out[i]);
94+
95+
if (!IsMaskSet) {
96+
// Values loaded by lsc_block_load() are undefined - skip the check.
97+
if (!UseOldValuesOperand)
98+
continue;
99+
Expected = sycl::bit_cast<Tuint>((T)i);
100+
}
101+
102+
if (Computed != Expected) {
103+
NumErrors++;
104+
std::cout << "out[" << i << "] = 0x" << std::hex << Computed
105+
<< " vs etalon = 0x" << Expected << std::dec << std::endl;
106+
}
107+
}
108+
109+
std::cout << (NumErrors ? " FAILED" : " passed") << std::endl;
110+
sycl::free(Out, Q);
111+
sycl::free(In, Q);
112+
return NumErrors == 0;
113+
}
114+
115+
template <typename T> bool test_lsc_block_load() {
116+
constexpr lsc_data_size DS = lsc_data_size::default_size;
117+
constexpr cache_hint L1H = cache_hint::none;
118+
constexpr cache_hint L3H = cache_hint::none;
119+
120+
constexpr bool NoPrefetch = false;
121+
constexpr bool CheckMerge = true;
122+
constexpr bool NoCheckMerge = false;
123+
124+
bool Passed = true;
125+
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
126+
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
127+
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 2);
128+
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 8);
129+
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 3);
130+
if constexpr (sizeof(T) * 2 >= sizeof(int))
131+
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(5, 5);
132+
if constexpr (sizeof(T) >= sizeof(int))
133+
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 5);
134+
135+
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(1, 4);
136+
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 2);
137+
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, CheckMerge>(4, 4);
138+
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 8);
139+
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 3);
140+
if constexpr (sizeof(T) * 2 >= sizeof(int))
141+
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(5, 5);
142+
if constexpr (sizeof(T) >= sizeof(int))
143+
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 5);
144+
145+
return Passed;
146+
}
147+
148+
template <typename T, lsc_data_size DS = lsc_data_size::default_size>
149+
bool test_lsc_prefetch() {
150+
constexpr cache_hint L1H = cache_hint::cached;
151+
constexpr cache_hint L3H = cache_hint::uncached;
152+
constexpr bool DoPrefetch = true;
153+
154+
bool Passed = true;
155+
Passed &= test<T, 64, DS, L1H, L3H, DoPrefetch>(1, 4);
156+
Passed &= test<T, 32, DS, L1H, L3H, DoPrefetch>(1, 4);
157+
Passed &= test<T, 16, DS, L1H, L3H, DoPrefetch>(2, 2);
158+
Passed &= test<T, 8, DS, L1H, L3H, DoPrefetch>(2, 8);
159+
Passed &= test<T, 4, DS, L1H, L3H, DoPrefetch>(3, 3);
160+
if constexpr (sizeof(T) * 2 >= sizeof(int))
161+
Passed &= test<T, 2, DS, L1H, L3H, DoPrefetch>(5, 5);
162+
if constexpr (sizeof(T) >= sizeof(int))
163+
Passed &= test<T, 1, DS, L1H, L3H, DoPrefetch>(3, 5);
164+
165+
return Passed;
166+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
//==------- lsc_usm_gather_prefetch.hpp - DPC++ ESIMD on-device test -------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <sycl/ext/intel/esimd.hpp>
10+
#include <sycl/sycl.hpp>
11+
12+
#include <iostream>
13+
14+
#include "common.hpp"
15+
16+
using namespace sycl;
17+
using namespace sycl::ext::intel::esimd;
18+
using namespace sycl::ext::intel::experimental::esimd;
19+
20+
#ifdef USE_64_BIT_OFFSET
21+
typedef uint64_t Toffset;
22+
#else
23+
typedef uint32_t Toffset;
24+
#endif
25+
26+
// Tests lsc_gather() on USM shared memory, optionally preceded by
// lsc_prefetch(). Gathered values are written back with lsc_scatter() into an
// output buffer pre-filled with a sentinel ("old") value, then checked on the
// host: elements covered by the predicate must equal the input (masked to the
// data-size width), all others must still hold the sentinel.
// pmask supplies the per-lane predicate bits. Returns true if verification
// passed.
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
          uint16_t VL, uint16_t VS,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          bool use_prefetch = false>
bool test(uint32_t pmask = 0xffffffff) {
  if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
    static_assert(VS == 1, "Only D32 and D64 support vector load");
  }
  static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");

  if constexpr (VS > 1) {
    static_assert(VL == 16 || VL == 32,
                  "IGC prohibits execution size less than SIMD size when "
                  "vector size is greater than 1");
  }

  // NOTE(review): total element count is kept in uint16_t - fine for the
  // configurations used by the drivers below, but would silently wrap for
  // larger Groups*Threads*VL*VS products.
  uint16_t Size = Groups * Threads * VL * VS;
  // Bit-exact comparisons go through the same-size unsigned type.
  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
  // vmask keeps only the bits a narrowing data size actually transfers.
  Tuint vmask = (Tuint)-1;
  if constexpr (DS == lsc_data_size::u8u32)
    vmask = (T)0xff;
  if constexpr (DS == lsc_data_size::u16u32)
    vmask = (T)0xffff;
  if constexpr (DS == lsc_data_size::u16u32h)
    vmask = (T)0xffff0000;

  T old_val = get_rand<T>();
  // NOTE(review): zero_val appears unused in this function - confirm and
  // consider removing.
  T zero_val = (T)0;

  auto q = queue{gpu_selector_v};
  auto dev = q.get_device();
  std::cout << "Running case #" << case_num << " on "
            << dev.get_info<sycl::info::device::name>() << "\n";
  auto ctx = q.get_context();

  sycl::range<1> GlobalRange{Groups};
  sycl::range<1> LocalRange{Threads};
  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};

  // Output starts as all-sentinel; untouched (predicated-off) elements must
  // keep this value.
  T *out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
  for (int i = 0; i < Size; i++)
    out[i] = old_val;

  T *in = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
  for (int i = 0; i < Size; i++)
    in[i] = get_rand<T>();

  try {
    auto e = q.submit([&](handler &cgh) {
      cgh.parallel_for<KernelID<case_num>>(
          Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
            uint16_t globalID = ndi.get_global_id(0);
            uint32_t elem_off = globalID * VL * VS;
            uint32_t byte_off = elem_off * sizeof(T);

#ifndef USE_SCALAR_OFFSET
            // Per-lane byte offsets: lane i addresses element i*VS of this
            // work-item's chunk.
            simd<Toffset, VL> offset(byte_off, VS * sizeof(T));
#else
            Toffset offset = byte_off;
#endif
            // Build the predicate from pmask, one bit per lane.
            simd_mask<VL> pred;
            for (int i = 0; i < VL; i++)
              pred.template select<1, 1>(i) = (pmask >> i) & 1;

            simd<T, VS * VL> vals;
            if constexpr (use_prefetch) {
              // Prefetch carries the cache hints; the subsequent gather then
              // uses no hints.
              lsc_prefetch<T, VS, DS, L1H, L3H, VL>(in, offset, pred);
              vals =
                  lsc_gather<T, VS, DS, cache_hint::none, cache_hint::none, VL>(
                      in, offset, pred);
            } else {
              vals = lsc_gather<T, VS, DS, L1H, L3H, VL>(in, offset, pred);
            }

            // For widening loads (u8u32/u16u32) the upper bits are not
            // defined by the data - mask them off before storing.
            if constexpr (DS == lsc_data_size::u8u32 ||
                          DS == lsc_data_size::u16u32)
              vals &= vmask;

            lsc_scatter<T, VS, lsc_data_size::default_size, cache_hint::none,
                        cache_hint::none, VL>(out, offset, vals, pred);
          });
    });
    e.wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(out, ctx);
    sycl::free(in, ctx);
    return false;
  }

  bool passed = true;

  for (int i = 0; i < Size; i++) {
    Tuint in_val = sycl::bit_cast<Tuint>(in[i]);
    Tuint out_val = sycl::bit_cast<Tuint>(out[i]);
#ifndef USE_SCALAR_OFFSET
    // Element i belongs to lane (i / VS) % VL; predicated-on lanes must hold
    // the (masked) input, others the sentinel.
    Tuint e = (pmask >> ((i / VS) % VL)) & 1 ? in_val & vmask
                                             : sycl::bit_cast<Tuint>(old_val);
#else
    // Calculate the mask to identify the areas that were actually updated
    // (the bit_cast of (float)VL extracts its exponent field, so for
    // power-of-two VL this computes 1 << (log2(VL) + 1) - presumably 2*VL;
    // verify against the scalar-offset semantics of lsc_scatter).
    constexpr uint16_t mask =
        1U << ((sycl::bit_cast<uint32_t>((float)VL) >> 23) - 126);
    Tuint e = ((i / VS) % VL == 0) && (pmask >> ((i / VS) % VL)) & (mask - 1)
                  ? in_val & vmask
                  : sycl::bit_cast<Tuint>(old_val);
#endif
    if (out_val != e) {
      passed = false;
      std::cout << "out[" << i << "] = 0x" << std::hex << out_val
                << " vs etalon = 0x" << e << std::dec << std::endl;
    }
  }

  if (!passed)
    std::cout << "Case #" << case_num << " FAILED" << std::endl;

  sycl::free(out, ctx);
  sycl::free(in, ctx);

  return passed;
}
148+
149+
template <int CaseNum, typename T,
150+
lsc_data_size DS = lsc_data_size::default_size, bool DoPrefetch>
151+
bool test_lsc_gather_prefetch() {
152+
constexpr cache_hint L1H = cache_hint::cached;
153+
constexpr cache_hint L3H = cache_hint::uncached;
154+
155+
bool Passed = true;
156+
Passed &= test<CaseNum, T, 4, 4, 1, 1, DS, L1H, L3H, DoPrefetch>(rand());
157+
#ifndef USE_SCALAR_OFFSET
158+
// These tests use lsc_scatter with scalar offset when USE_SCALAR_OFFSET macro
159+
// is set, which is UB and thus guarded by the macro here.
160+
Passed &= test<CaseNum + 1, T, 1, 4, 32, 1, DS, L1H, L3H, DoPrefetch>(rand());
161+
Passed &= test<CaseNum + 2, T, 2, 4, 16, 1, DS, L1H, L3H, DoPrefetch>(rand());
162+
Passed &= test<CaseNum + 3, T, 2, 2, 8, 1, DS, L1H, L3H, DoPrefetch>(rand());
163+
Passed &= test<CaseNum + 4, T, 4, 2, 4, 1, DS, L1H, L3H, DoPrefetch>(rand());
164+
Passed &= test<CaseNum + 5, T, 4, 16, 2, 1, DS, L1H, L3H, DoPrefetch>(rand());
165+
166+
if constexpr (((DS == lsc_data_size::default_size && sizeof(T) >= 4) ||
167+
DS == lsc_data_size::u32 || DS == lsc_data_size::u32) &&
168+
!DoPrefetch) {
169+
Passed &=
170+
test<CaseNum + 6, T, 2, 4, 32, 2, DS, L1H, L3H, DoPrefetch>(rand());
171+
}
172+
#endif // !USE_SCALAR_OFFSET
173+
174+
return Passed;
175+
}
176+
177+
template <int CaseNum, typename T,
178+
lsc_data_size DS = lsc_data_size::default_size>
179+
bool test_lsc_gather() {
180+
constexpr bool NoPrefetch = false;
181+
return test_lsc_gather_prefetch<CaseNum, T, DS, NoPrefetch>();
182+
}
183+
184+
template <int CaseNum, typename T,
185+
lsc_data_size DS = lsc_data_size::default_size>
186+
bool test_lsc_prefetch() {
187+
constexpr bool DoPrefetch = true;
188+
return test_lsc_gather_prefetch<CaseNum, T, DS, DoPrefetch>();
189+
}

0 commit comments

Comments
 (0)