Skip to content

Commit 5e9564d

Browse files
authored
[ESIMD] Conditionally revert the recent changes for slm_gather() (#12341)
The previous fix (#12316) added usage of llvm.masked.gather for slm_gather(). Such usage does not work with current GPU drivers if ESIMD function is called via invoke_simd() API. The fix here returns the previous lowering to GenX instead of llvm.masked.gather. Using the lowering to llvm.masked.gather can be used if define __ESIMD_GATHER_SCATTER_LLVM_IR macro (turned off by default). Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent c7549f9 commit 5e9564d

File tree

2 files changed

+165
-3
lines changed

2 files changed

+165
-3
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3146,9 +3146,16 @@ slm_gather(simd<uint32_t, N> byte_offsets, simd_mask<N> mask,
31463146
detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
31473147
static_assert(Alignment >= sizeof(T),
31483148
"slm_gather() requires at least element-size alignment");
3149-
simd<MsgT, N> PassThru; // it is intentionally undefined
3150-
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
3151-
byte_offsets.data(), mask.data(), PassThru.data());
3149+
if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
3150+
simd<MsgT, N> PassThru; // it is intentionally undefined
3151+
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
3152+
byte_offsets.data(), mask.data(), PassThru.data());
3153+
} else {
3154+
static_assert(N == 1 || N == 8 || N == 16 || N == 32,
3155+
"Unsupported vector length");
3156+
detail::LocalAccessorMarker acc;
3157+
return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
3158+
}
31523159
}
31533160

31543161
template <typename T, int N,
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
// GPU driver had an error in handling of SLM aligned block_loads/stores,
2+
// which has been fixed only in "1.3.26816", and in win/opencl version going
3+
// _after_ 101.4575.
4+
// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.4576
5+
//
6+
// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
7+
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
8+
//
9+
// VISALTO enable run
10+
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
11+
12+
/*
13+
* Test check basic support of local memory access in invoke_simd.
14+
*/
15+
16+
#include "../invoke_simd_utils.hpp"
17+
18+
#include <sycl/ext/intel/esimd.hpp>
19+
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
20+
#include <sycl/sycl.hpp>
21+
22+
#include <functional>
23+
#include <iostream>
24+
#include <type_traits>
25+
26+
// TODO: When gpu driver can pass/accept accessor by value,
27+
// the work-around undef #ifdef US_ACC_PTR should be removed.
28+
#define USE_ACC_PTR
29+
30+
/* Subgroup size attribute is optional
31+
* In case it is absent compiler decides what subgroup size to use
32+
*/
33+
#ifdef IMPL_SUBGROUP
34+
#define SUBGROUP_ATTR
35+
#else
36+
#define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]]
37+
#endif
38+
39+
using namespace sycl;
40+
using namespace sycl::ext::oneapi::experimental;
41+
namespace esimd = sycl::ext::intel::esimd;
42+
43+
using dtype = int;
44+
45+
constexpr int VL = 16;
46+
constexpr uint32_t LocalRange = VL * 2; // 2 sub-groups per 1 group.
47+
constexpr uint32_t GlobalRange = LocalRange * 2; // 2 groups.
48+
49+
ESIMD_INLINE void slm_load_store_test(
50+
local_accessor<dtype, 1> LocalAcc, uint32_t LAByteOffset, dtype *A,
51+
dtype *C, esimd::simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {
52+
53+
uint32_t LocalAccOffset =
54+
static_cast<uint32_t>(
55+
reinterpret_cast<std::uintptr_t>(LocalAcc.get_pointer().get())) +
56+
LAByteOffset;
57+
esimd::simd<uint32_t, VL> Offsets(LocalAccOffset, sizeof(dtype));
58+
auto Local1 = esimd::slm_gather<dtype, VL>(Offsets);
59+
Offsets += static_cast<uint32_t>(LocalRange * sizeof(dtype));
60+
auto Local2 = esimd::slm_gather<dtype, VL>(Offsets);
61+
62+
auto Global = esimd::gather(A, GlobalByteOffsets);
63+
auto Res = Global + Local1 + Local2;
64+
esimd::slm_scatter(Offsets, Res);
65+
}
66+
67+
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store_test(
68+
#ifdef USE_ACC_PTR
69+
local_accessor<dtype, 1> *LocalAcc,
70+
#else
71+
local_accessor<dtype, 1> LocalAcc,
72+
#endif
73+
uint32_t SLMByteOffset, dtype *A, dtype *C,
74+
simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {
75+
#ifdef USE_ACC_PTR
76+
slm_load_store_test(*LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets);
77+
#else
78+
slm_load_store_test(LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets);
79+
#endif
80+
}
81+
82+
int main(void) {
83+
auto Q = queue{gpu_selector_v};
84+
auto Dev = Q.get_device();
85+
std::cout << "Running on " << Dev.get_info<sycl::info::device::name>()
86+
<< std::endl;
87+
88+
auto DeviceSLMSize = Dev.get_info<sycl::info::device::local_mem_size>();
89+
std::cout << "Local Memory Size: " << DeviceSLMSize << std::endl;
90+
91+
sycl::nd_range<1> NDRange{range<1>{GlobalRange}, range<1>{LocalRange}};
92+
93+
// The test is going to use (LocalRange * 2) elements of dtype type.
94+
if (DeviceSLMSize < LocalRange * 2 * sizeof(dtype)) {
95+
// Report an error - the test needs a fix.
96+
std::cerr << "Error: Test needs more SLM memory than device has"
97+
<< std::endl;
98+
return 1;
99+
}
100+
101+
auto *A = malloc_shared<dtype>(GlobalRange, Q);
102+
auto *C = malloc_shared<dtype>(GlobalRange, Q);
103+
104+
for (auto i = 0; i < GlobalRange; i++) {
105+
A[i] = i;
106+
C[i] = 0;
107+
}
108+
try {
109+
Q.submit([&](handler &CGH) {
110+
auto LocalAcc = local_accessor<dtype, 1>(LocalRange * 2, CGH);
111+
CGH.parallel_for(NDRange, [=](nd_item<1> Item) SUBGROUP_ATTR {
112+
uint32_t GlobalId = Item.get_global_id(0);
113+
uint32_t LocalId = Item.get_local_id(0);
114+
auto LocalAccCopy = LocalAcc;
115+
LocalAccCopy[LocalId] = GlobalId * 100;
116+
LocalAccCopy[LocalId + LocalRange] = GlobalId * 10000;
117+
Item.barrier();
118+
119+
uint32_t LAByteOffset = (LocalId / VL) * VL * sizeof(dtype);
120+
uint32_t GlobalByteOffset = GlobalId * sizeof(dtype);
121+
sycl::sub_group SG = Item.get_sub_group();
122+
#ifdef USE_ACC_PTR
123+
auto LocalAccArg = uniform{&LocalAccCopy};
124+
#else
125+
auto LocalAccArg = uniform{LocalAccCopy};
126+
#endif
127+
invoke_simd(SG, invoke_slm_load_store_test, LocalAccArg,
128+
uniform{LAByteOffset}, uniform{A}, uniform{C},
129+
GlobalByteOffset);
130+
C[GlobalId] = LocalAccCopy[LocalId + LocalRange];
131+
});
132+
}).wait();
133+
} catch (sycl::exception const &e) {
134+
std::cout << "SYCL exception caught: " << e.what() << '\n';
135+
free(A, Q);
136+
free(C, Q);
137+
return e.code().value();
138+
}
139+
140+
bool Pass = true;
141+
for (auto i = 0; i < GlobalRange; i++) {
142+
dtype Expected = A[i] + i * (10000 + 100);
143+
if (C[i] != Expected) {
144+
std::cout << "Error: C[" << i << "]:" << C[i]
145+
<< " != [expected]:" << Expected << std::endl;
146+
Pass = false;
147+
}
148+
}
149+
150+
free(A, Q);
151+
free(C, Q);
152+
153+
std::cout << "Test result: " << (Pass ? "Pass" : "Fail") << std::endl;
154+
return Pass ? 0 : 1;
155+
}

0 commit comments

Comments
 (0)