Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 87158a2

Browse files
committed
[SYCL][Fusion] Test kernel fusion and optimization
Signed-off-by: Lukas Sommer <[email protected]>
1 parent fd1af4d commit 87158a2

29 files changed

+2140
-8
lines changed

SYCL/KernelFusion/abort_fusion.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
// REQUIRES: fusion
8+
9+
// Test fusion being aborted: Different scenarios causing the JIT compiler
10+
// to abort fusion due to constraint violations for fusion. Also check that
11+
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.
12+
13+
#include <sycl/sycl.hpp>
14+
15+
using namespace sycl;
16+
17+
constexpr size_t dataSize = 512;
18+
19+
enum class Internalization { None, Local, Private };
20+
21+
template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
22+
void performFusion(queue &q, range<Kernel1Dim> k1Global,
23+
range<Kernel1Dim> k1Local) {
24+
int in[dataSize], tmp[dataSize], out[dataSize];
25+
26+
for (size_t i = 0; i < dataSize; ++i) {
27+
in[i] = i;
28+
tmp[i] = -1;
29+
out[i] = -1;
30+
}
31+
{
32+
buffer<int> bIn{in, range{dataSize}};
33+
buffer<int> bTmp{tmp, range{dataSize}};
34+
buffer<int> bOut{out, range{dataSize}};
35+
36+
ext::codeplay::experimental::fusion_wrapper fw(q);
37+
fw.start_fusion();
38+
39+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
40+
41+
q.submit([&](handler &cgh) {
42+
auto accIn = bIn.get_access(cgh);
43+
auto accTmp = bTmp.get_access(cgh);
44+
cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
45+
[=](item<Kernel1Dim> i) {
46+
auto LID = i.get_linear_id();
47+
accTmp[LID] = accIn[LID] + 5;
48+
});
49+
});
50+
51+
q.submit([&](handler &cgh) {
52+
auto accTmp = bTmp.get_access(cgh);
53+
auto accOut = bOut.get_access(cgh);
54+
cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}}, [=](id<1> i) {
55+
accOut[i] = accTmp[i] * 2;
56+
});
57+
});
58+
59+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
60+
61+
assert(!fw.is_in_fusion_mode() &&
62+
"Queue should not be in fusion mode anymore");
63+
}
64+
65+
// Check the results
66+
size_t numErrors = 0;
67+
for (size_t i = 0; i < k1Global.size(); ++i) {
68+
if (out[i] != ((i + 5) * 2)) {
69+
++numErrors;
70+
}
71+
}
72+
if (numErrors) {
73+
std::cout << "COMPUTATION ERROR\n";
74+
} else {
75+
std::cout << "COMPUTATION OK\n";
76+
}
77+
}
78+
79+
// Drive three fusion-abort scenarios. Each call submits two kernels whose
// ND-ranges are incompatible in a different way; the JIT must abort fusion,
// emit a warning (matched by the CHECK lines below under
// SYCL_RT_WARNING_LEVEL=1), and still produce correct results.
int main() {

  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: Fusing two kernels with different dimensionality should lead to
  // fusion being aborted.
  performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
                                                  range<2>{1, 8});
  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different global size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
                                                  range<1>{8});
  // NOTE(review): fixed apparent typo "kerneles" -> "kernels" for consistency
  // with the other two CHECK lines in this file — confirm against the
  // runtime's actual warning text before merging.
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different global size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different local size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
                                                  range<1>{16});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
// REQUIRES: fusion
8+
9+
// Test incomplete internalization: Different scenarios causing the JIT compiler
10+
// to abort internalization due to target or parameter mismatch. Also check that
11+
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.
12+
13+
#include <sycl/sycl.hpp>
14+
15+
using namespace sycl;
16+
17+
constexpr size_t dataSize = 512;
18+
19+
enum class Internalization { None, Local, Private };
20+
21+
// Submit two kernels sharing an intermediate buffer, requesting the given
// internalization target (None/Local/Private) for each kernel's accessor and
// the given work-group size (0 means a plain range launch with no explicit
// local size). Mismatching targets or local sizes make the JIT fall back to
// weaker (or no) internalization and emit a warning, which main() checks via
// FileCheck. Prints "COMPUTATION OK" when the results are correct and the
// observed internalization matches `expectInternalization`, otherwise
// "COMPUTATION ERROR" or "WRONG INTERNALIZATION".
void performFusion(queue &q, Internalization intKernel1,
                   size_t localSizeKernel1, Internalization intKernel2,
                   size_t localSizeKernel2,
                   bool expectInternalization = false) {
  int in[dataSize], tmp[dataSize], out[dataSize];
  for (size_t i = 0; i < dataSize; ++i) {
    in[i] = i;
    tmp[i] = -1;
    out[i] = -1;
  }
  {
    buffer<int> bIn{in, range{dataSize}};
    buffer<int> bTmp{tmp, range{dataSize}};
    buffer<int> bOut{out, range{dataSize}};

    ext::codeplay::experimental::fusion_wrapper fw{q};
    fw.start_fusion();

    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

    // First kernel: tmp = in + 5, with the requested promotion property on
    // the tmp accessor.
    q.submit([&](handler &cgh) {
      auto accIn = bIn.get_access(cgh);
      property_list properties{};
      if (intKernel1 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel1 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);

      // localSizeKernel1 == 0 selects a plain range launch (no local size).
      if (localSizeKernel1 > 0) {
        cgh.parallel_for<class Kernel1>(
            nd_range<1>{{dataSize}, {localSizeKernel1}},
            [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      } else {
        cgh.parallel_for<class KernelOne>(
            dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
      }
    });

    // Second kernel: out = tmp * 2, with its own promotion request.
    q.submit([&](handler &cgh) {
      property_list properties{};
      if (intKernel2 == Internalization::Private) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_private{}};
      } else if (intKernel2 == Internalization::Local) {
        properties = {
            sycl::ext::codeplay::experimental::property::promote_local{}};
      }
      accessor<int> accTmp = bTmp.get_access(cgh, properties);
      auto accOut = bOut.get_access(cgh);
      if (localSizeKernel2 > 0) {
        cgh.parallel_for<class Kernel2>(
            nd_range<1>{{dataSize}, {localSizeKernel2}},
            [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      } else {
        cgh.parallel_for<class KernelTwo>(
            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
      }
    });

    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

    assert(!fw.is_in_fusion_mode() &&
           "Queue should not be in fusion mode anymore");
  }

  // Check the results
  size_t numErrors = 0;
  size_t numInternalized = 0;
  for (size_t i = 0; i < dataSize; ++i) {
    if (out[i] != ((i + 5) * 2)) {
      ++numErrors;
    }
    // tmp keeps its poison value iff the write to it was internalized and
    // therefore never reached the host-visible buffer.
    if (tmp[i] == -1) {
      ++numInternalized;
    }
  }
  if (numErrors) {
    std::cout << "COMPUTATION ERROR\n";
    return;
  }
  if (!expectInternalization && numInternalized) {
    std::cout << "WRONG INTERNALIZATION\n";
    return;
  }
  // Previously only unexpected internalization was detected; also flag the
  // symmetric failure where internalization was expected but did not (fully)
  // happen, so the expectInternalization=true scenario is actually verified.
  if (expectInternalization && numInternalized != dataSize) {
    std::cout << "WRONG INTERNALIZATION\n";
    return;
  }
  std::cout << "COMPUTATION OK\n";
}
111+
112+
int main() {
113+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
114+
115+
// Scenario: One accessor without internalization, one with local
116+
// internalization. Should fall back to no internalization and print a
117+
// warning.
118+
std::cout << "None, Local(0)\n";
119+
performFusion(q, Internalization::None, 0, Internalization::Local, 0);
120+
// CHECK: None, Local(0)
121+
// CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
122+
// CHECK-NEXT: COMPUTATION OK
123+
124+
// Scenario: One accessor without internalization, one with private
125+
// internalization. Should fall back to no internalization and print a
126+
// warning.
127+
std::cout << "None, Private\n";
128+
performFusion(q, Internalization::None, 0, Internalization::Private, 0);
129+
// CHECK-NEXT: None, Private
130+
// CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
131+
// CHECK-NEXT: COMPUTATION OK
132+
133+
// Scenario: Both accessor with local promotion, but the second kernel does
134+
// not specify a work-group size. No promotion should happen and a warning
135+
// should be printed.
136+
std::cout << "Local(8), Local(0)\n";
137+
performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
138+
// CHECK-NEXT: Local(8), Local(0)
139+
// CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
140+
// CHECK-NEXT: COMPUTATION OK
141+
142+
// Scenario: Both accessor with local promotion, but the first kernel does
143+
// not specify a work-group size. No promotion should happen and a warning
144+
// should be printed.
145+
std::cout << "Local(0), Local(8)\n";
146+
performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
147+
// CHECK-NEXT: Local(0), Local(8)
148+
// CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
149+
// CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
150+
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
151+
// CHECK-NEXT: COMPUTATION OK
152+
153+
// Scenario: Both accessor with local promotion, but the kernels specify
154+
// different work-group sizes. No promotion should happen and a warning should
155+
// be printed.
156+
std::cout << "Local(8), Local(16)\n";
157+
performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
158+
// CHECK-NEXT: Local(8), Local(16)
159+
// CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
160+
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
161+
// CHECK-NEXT: COMPUTATION OK
162+
163+
// Scenario: One accessor with local internalization, one with private
164+
// internalization. Should fall back to local internalization and print a
165+
// warning.
166+
std::cout << "Local(8), Private(8)\n";
167+
performFusion(q, Internalization::Local, 8, Internalization::Private, 8,
168+
/* expectInternalization */ true);
169+
// CHECK-NEXT: Local(8), Private(8)
170+
// CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion
171+
// CHECK-NEXT: COMPUTATION OK
172+
173+
return 0;
174+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// UNSUPPORTED: cuda || hip
5+
// REQUIRES: fusion
6+
7+
// Test complete fusion with local internalization and a combination of kernels
8+
// that require a work-group barrier to be inserted by fusion.
9+
10+
#include <sycl/sycl.hpp>
11+
12+
using namespace sycl;
13+
14+
int main() {
15+
constexpr size_t dataSize = 512;
16+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
17+
18+
for (size_t i = 0; i < dataSize; ++i) {
19+
in1[i] = i * 2;
20+
in2[i] = i * 3;
21+
in3[i] = i * 4;
22+
tmp[i] = -1;
23+
out[i] = -1;
24+
}
25+
26+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
27+
28+
{
29+
buffer<int> bIn1{in1, range{dataSize}};
30+
buffer<int> bIn2{in2, range{dataSize}};
31+
buffer<int> bIn3{in3, range{dataSize}};
32+
buffer<int> bTmp{
33+
tmp,
34+
range{dataSize},
35+
{sycl::ext::codeplay::experimental::property::promote_local{}}};
36+
buffer<int> bOut{out, range{dataSize}};
37+
38+
ext::codeplay::experimental::fusion_wrapper fw{q};
39+
fw.start_fusion();
40+
41+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
42+
43+
q.submit([&](handler &cgh) {
44+
auto accIn1 = bIn1.get_access(cgh);
45+
auto accIn2 = bIn2.get_access(cgh);
46+
auto accTmp = bTmp.get_access(cgh);
47+
cgh.parallel_for<class KernelOne>(
48+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
49+
auto workgroupSize = i.get_local_range(0);
50+
auto baseOffset = i.get_group_linear_id() * workgroupSize;
51+
auto localIndex = i.get_local_linear_id();
52+
auto localOffset = (workgroupSize - 1) - localIndex;
53+
accTmp[baseOffset + localOffset] =
54+
accIn1[baseOffset + localOffset] +
55+
accIn2[baseOffset + localOffset];
56+
});
57+
});
58+
59+
q.submit([&](handler &cgh) {
60+
auto accTmp = bTmp.get_access(cgh);
61+
auto accIn3 = bIn3.get_access(cgh);
62+
auto accOut = bOut.get_access(cgh);
63+
cgh.parallel_for<class KernelTwo>(
64+
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
65+
auto index = i.get_global_linear_id();
66+
accOut[index] = accTmp[index] * accIn3[index];
67+
});
68+
});
69+
70+
fw.complete_fusion();
71+
72+
assert(!fw.is_in_fusion_mode() &&
73+
"Queue should not be in fusion mode anymore");
74+
}
75+
76+
// Check the results
77+
for (size_t i = 0; i < dataSize; ++i) {
78+
assert(out[i] == (20 * i * i) && "Computation error");
79+
assert(tmp[i] == -1 && "Not internalized");
80+
}
81+
82+
return 0;
83+
}

0 commit comments

Comments
 (0)