Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 4ce05a0

Browse files
[SYCL][Fusion] Test fusion of kernels with different ND-ranges (#1575)
Check that kernel fusion works when fusing kernels with different ND-ranges (different global sizes and dimensions). Implementation: intel/llvm#8209 --------- Signed-off-by: Victor Perez <[email protected]> Co-authored-by: Lukas Sommer <[email protected]>
1 parent a22c91e commit 4ce05a0

File tree

3 files changed

+274
-17
lines changed

3 files changed

+274
-17
lines changed

SYCL/KernelFusion/abort_fusion.cpp

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -80,25 +80,12 @@ int main() {
8080

8181
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
8282

83-
// Scenario: Fusing two kernels with different dimensionality should lead to
84-
// fusion being aborted.
85-
performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
86-
range<2>{1, 8});
87-
// CHECK: WARNING: Cannot fuse kernels with different dimensionality
88-
// CHECK-NEXT: COMPUTATION OK
89-
90-
// Scenario: Fusing two kernels with different global size should lead to
91-
// fusion being aborted.
92-
performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
93-
range<1>{8});
94-
// CHECK-NEXT: WARNING: Cannot fuse kerneles with different global size
95-
// CHECK-NEXT: COMPUTATION OK
96-
9783
// Scenario: Fusing two kernels with different local size should lead to
9884
// fusion being aborted.
9985
performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
10086
range<1>{16});
101-
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
87+
// CHECK: ERROR: JIT compilation for kernel fusion failed with message:
88+
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
10289
// CHECK-NEXT: COMPUTATION OK
10390

10491
return 0;

SYCL/KernelFusion/abort_internalization.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,6 @@ int main() {
147147
// CHECK-NEXT: Local(0), Local(8)
148148
// CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
149149
// CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
150-
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
151150
// CHECK-NEXT: COMPUTATION OK
152151

153152
// Scenario: Both accessor with local promotion, but the kernels specify
@@ -157,7 +156,8 @@ int main() {
157156
performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
158157
// CHECK-NEXT: Local(8), Local(16)
159158
// CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
160-
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
159+
// CHECK-NEXT: ERROR: JIT compilation for kernel fusion failed with message:
160+
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
161161
// CHECK-NEXT: COMPUTATION OK
162162

163163
// Scenario: One accessor with local internalization, one with private
Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
// REQUIRES: fusion
8+
9+
// Test complete fusion of kernels with different ND-ranges.
10+
11+
// Kernels with different ND-ranges should be fused.
12+
// CHECK-NOT: Cannot fuse kernels with different offsets or local sizes
13+
14+
#include <sycl/sycl.hpp>
15+
16+
#include <algorithm>
17+
18+
using namespace sycl;
19+
20+
////////////////////////////////////////////////////////////////////////////////
21+
// Kernels
22+
////////////////////////////////////////////////////////////////////////////////
23+
24+
using DataTy = vec<uint16_t, 3>;
25+
using VecTy = std::vector<DataTy>;
26+
27+
template <std::size_t Dimensions> class FillBase {
28+
public:
29+
FillBase(accessor<DataTy, 1, access_mode::write> GS,
30+
accessor<DataTy, 1, access_mode::write> LS,
31+
accessor<DataTy, 1, access_mode::write> GrS,
32+
accessor<DataTy, 1, access_mode::write> G,
33+
accessor<DataTy, 1, access_mode::write> L,
34+
accessor<DataTy, 1, access_mode::write> Gr)
35+
: GS{GS}, LS{LS}, GrS{GrS}, G{G}, L{L}, Gr{Gr} {}
36+
37+
protected:
38+
template <typename F> static DataTy getValue(F gen) {
39+
DataTy x;
40+
for (std::size_t i = 0; i < Dimensions; ++i) {
41+
x[i] = gen(i);
42+
}
43+
return x;
44+
}
45+
46+
accessor<DataTy, 1, access_mode::write> GS;
47+
accessor<DataTy, 1, access_mode::write> LS;
48+
accessor<DataTy, 1, access_mode::write> GrS;
49+
accessor<DataTy, 1, access_mode::write> G;
50+
accessor<DataTy, 1, access_mode::write> L;
51+
accessor<DataTy, 1, access_mode::write> Gr;
52+
};
53+
54+
template <int Dimensions> class Fill : public FillBase<Dimensions> {
55+
public:
56+
using FillBase<Dimensions>::FillBase;
57+
using FillBase<Dimensions>::getValue;
58+
59+
void operator()(item<Dimensions> i) const {
60+
const auto lid = i.get_linear_id();
61+
62+
FillBase<Dimensions>::GS[lid] =
63+
getValue([i](int arg) { return i.get_range(arg); });
64+
FillBase<Dimensions>::G[lid] =
65+
getValue([i](int arg) { return i.get_id(arg); });
66+
}
67+
};
68+
69+
template <int Dimensions> class FillLS : public FillBase<Dimensions> {
70+
public:
71+
using FillBase<Dimensions>::FillBase;
72+
using FillBase<Dimensions>::getValue;
73+
74+
void operator()(nd_item<Dimensions> nd) const {
75+
const auto lid = nd.get_global_linear_id();
76+
FillBase<Dimensions>::GS[lid] =
77+
getValue([nd](int arg) { return nd.get_global_range(arg); });
78+
FillBase<Dimensions>::LS[lid] =
79+
getValue([nd](int arg) { return nd.get_local_range(arg); });
80+
FillBase<Dimensions>::GrS[lid] =
81+
getValue([nd](int arg) { return nd.get_group_range(arg); });
82+
FillBase<Dimensions>::G[lid] =
83+
getValue([nd](int arg) { return nd.get_global_id(arg); });
84+
FillBase<Dimensions>::L[lid] =
85+
getValue([nd](int arg) { return nd.get_local_id(arg); });
86+
FillBase<Dimensions>::Gr[lid] =
87+
getValue([nd](int arg) { return nd.get_group(arg); });
88+
}
89+
};
90+
91+
////////////////////////////////////////////////////////////////////////////////
92+
// Range description
93+
////////////////////////////////////////////////////////////////////////////////
94+
95+
struct RangeDesc {
96+
using Indices = std::array<std::size_t, 3>;
97+
98+
constexpr RangeDesc(std::initializer_list<std::size_t> GS)
99+
: Dimensions{static_cast<int>(GS.size())}, GS{init(GS)},
100+
LS{std::nullopt} {}
101+
constexpr RangeDesc(std::initializer_list<std::size_t> GS,
102+
std::initializer_list<std::size_t> LS)
103+
: Dimensions{static_cast<int>(GS.size())}, GS{init(GS)}, LS{init(LS)} {}
104+
105+
constexpr std::size_t num_work_items() const { return GS[0] * GS[1] * GS[2]; }
106+
107+
int Dimensions;
108+
Indices GS;
109+
std::optional<Indices> LS;
110+
111+
template <std::size_t D> range<D> get_range() const;
112+
template <std::size_t D> nd_range<D> get_nd_range() const;
113+
114+
private:
115+
static constexpr Indices init(std::initializer_list<std::size_t> sizes) {
116+
Indices res{1, 1, 1};
117+
std::copy(sizes.begin(), sizes.end(), res.begin());
118+
return res;
119+
}
120+
};
121+
122+
template <> range<1> RangeDesc::get_range<1>() const { return {GS[0]}; }
123+
124+
template <> range<2> RangeDesc::get_range<2>() const { return {GS[0], GS[1]}; }
125+
126+
template <> range<3> RangeDesc::get_range<3>() const {
127+
return {GS[0], GS[1], GS[2]};
128+
}
129+
130+
template <> nd_range<1> RangeDesc::get_nd_range<1>() const {
131+
return {get_range<1>(), {(*LS)[0]}};
132+
}
133+
134+
template <> nd_range<2> RangeDesc::get_nd_range<2>() const {
135+
return {get_range<2>(), {(*LS)[0], (*LS)[1]}};
136+
}
137+
138+
template <> nd_range<3> RangeDesc::get_nd_range<3>() const {
139+
return {get_range<3>(), {(*LS)[0], (*LS)[1], (*LS)[2]}};
140+
}
141+
142+
////////////////////////////////////////////////////////////////////////////////
143+
// Test
144+
////////////////////////////////////////////////////////////////////////////////
145+
146+
using TestResult = std::vector<VecTy>;
147+
148+
TestResult run(const std::vector<RangeDesc> &sizes, bool fuse) {
149+
const auto numWorkItems =
150+
std::max_element(sizes.begin(), sizes.end(),
151+
[](const auto &LHS, const auto &RHS) {
152+
return LHS.num_work_items() < RHS.num_work_items();
153+
})
154+
->num_work_items();
155+
TestResult res(6 * sizes.size(), VecTy(numWorkItems));
156+
157+
{
158+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
159+
std::vector<buffer<DataTy>> Buffers;
160+
Buffers.reserve(res.size());
161+
for (auto &v : res) {
162+
Buffers.emplace_back(v);
163+
}
164+
if (fuse) {
165+
ext::codeplay::experimental::fusion_wrapper fw{q};
166+
fw.start_fusion();
167+
}
168+
for (std::size_t i = 0; i < sizes.size(); ++i) {
169+
q.submit([&](handler &cgh) {
170+
const auto &size = *(sizes.begin() + i);
171+
const auto j = i * 6;
172+
accessor GS{Buffers[j], cgh, write_only};
173+
accessor LS{Buffers[j + 1], cgh, write_only};
174+
accessor GrS{Buffers[j + 2], cgh, write_only};
175+
accessor G{Buffers[j + 3], cgh, write_only};
176+
accessor L{Buffers[j + 4], cgh, write_only};
177+
accessor Gr{Buffers[j + 5], cgh, write_only};
178+
if (size.LS) {
179+
switch (size.Dimensions) {
180+
case 1:
181+
cgh.parallel_for(size.template get_nd_range<1>(),
182+
FillLS<1>{GS, LS, GrS, G, L, Gr});
183+
break;
184+
case 2:
185+
cgh.parallel_for(size.template get_nd_range<2>(),
186+
FillLS<2>{GS, LS, GrS, G, L, Gr});
187+
break;
188+
case 3:
189+
cgh.parallel_for(size.template get_nd_range<3>(),
190+
FillLS<3>{GS, LS, GrS, G, L, Gr});
191+
break;
192+
}
193+
} else {
194+
switch (size.Dimensions) {
195+
case 1:
196+
cgh.parallel_for(size.template get_range<1>(),
197+
Fill<1>{GS, LS, GrS, G, L, Gr});
198+
break;
199+
case 2:
200+
cgh.parallel_for(size.template get_range<2>(),
201+
Fill<2>{GS, LS, GrS, G, L, Gr});
202+
break;
203+
case 3:
204+
cgh.parallel_for(size.template get_range<3>(),
205+
Fill<3>{GS, LS, GrS, G, L, Gr});
206+
break;
207+
}
208+
}
209+
});
210+
}
211+
if (fuse) {
212+
ext::codeplay::experimental::fusion_wrapper fw{q};
213+
assert(fw.is_in_fusion_mode() && "Fusion failed");
214+
fw.complete_fusion(
215+
{ext::codeplay::experimental::property::no_barriers{}});
216+
assert(!fw.is_in_fusion_mode() && "Fusion failed");
217+
}
218+
}
219+
220+
return res;
221+
}
222+
223+
void test(const std::vector<RangeDesc> &sizes) {
224+
const auto res = run(sizes, /*fuse*/ false);
225+
const auto fusedRes = run(sizes, /*fuse*/ true);
226+
assert(std::equal(res.begin(), res.end(), fusedRes.begin(),
227+
[](const auto &LHS, const auto &RHS) {
228+
return std::equal(LHS.begin(), LHS.end(), RHS.begin(),
229+
[](const auto &LHS, const auto &RHS) {
230+
return all(LHS == RHS);
231+
});
232+
}) &&
233+
"COMPUTATION ERROR");
234+
}
235+
236+
int main() {
237+
// 1-D kernels with different global sizes
238+
test({RangeDesc{10}, RangeDesc{20}});
239+
test({RangeDesc{10}, RangeDesc{20}, RangeDesc{30}});
240+
241+
// Two 1-D kernels with different global sizes and a 2-D kernel with more
242+
// work-items.
243+
test({RangeDesc{10}, RangeDesc{20}, RangeDesc{10, 10}});
244+
245+
// Two 1-D kernels with different global sizes and specified (equal) local
246+
// size.
247+
const auto R2 = {2ul};
248+
test({RangeDesc{{10}, R2}, RangeDesc{{20}, R2}});
249+
250+
// Three 1-D kernels with different global sizes and specified (equal) local
251+
// size.
252+
const auto R5 = {5ul};
253+
test({RangeDesc{{10}, R5}, RangeDesc{{20}, R5}, RangeDesc{{30}, R5}});
254+
255+
// Two 1-D kernels with different global sizes and a 2-D kernel with more
256+
// work-items and specified (equal) local size.
257+
test({RangeDesc{{10}, R2}, RangeDesc{{20}, R2}, RangeDesc{{10, 10}, {2, 1}}});
258+
259+
// Three 2-D kernels with different global sizes.
260+
test({RangeDesc{{10, 15}, {2, 5}}, RangeDesc{{20, 10}, {2, 5}},
261+
RangeDesc{{10, 5}, {2, 5}}});
262+
263+
// Three 3-D kernels with different global sizes.
264+
test({RangeDesc{{10, 4, 2}, {5, 2, 1}}, RangeDesc{{20, 2, 4}, {5, 2, 1}},
265+
RangeDesc{{10, 2, 4}, {5, 2, 1}}});
266+
267+
// 1-D, 2-D and 3-D kernels with different global sizes.
268+
test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
269+
RangeDesc{{10, 1, 1}, {5, 1, 1}}});
270+
}

0 commit comments

Comments
 (0)