@@ -14,9 +14,16 @@ constexpr size_t dataSize = 512;
14
14
15
15
enum class Internalization { None, Local, Private };
16
16
17
- template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
18
- void performFusion (queue &q, range<Kernel1Dim> k1Global,
19
- range<Kernel1Dim> k1Local) {
17
// getSize: maps a launch-range object to a single element count.
// performFusion calls getSize(R1) to bound its result-check loop.
// The primary template is only declared here; the visible code defines it
// solely through the two explicit specializations that follow.
+ template <typename Range> size_t getSize (Range r);
18
+
19
// range<1>: the count is the range's own size().
+ template <> size_t getSize (range<1 > r) { return r.size (); }
20
// nd_range<1>: the count is the size() of the global range; the local
// (work-group) range is not consulted.
+ template <> size_t getSize (nd_range<1 > r) {
21
+ return r.get_global_range ().size ();
22
+ }
23
+
24
+ template <typename Kernel1Name, typename Kernel2Name, typename Range1,
25
+ typename Range2>
26
+ void performFusion (queue &q, Range1 R1, Range2 R2) {
20
27
int in[dataSize], tmp[dataSize], out[dataSize];
21
28
22
29
for (size_t i = 0 ; i < dataSize; ++i) {
@@ -37,19 +44,15 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,
37
44
q.submit ([&](handler &cgh) {
38
45
auto accIn = bIn.get_access (cgh);
39
46
auto accTmp = bTmp.get_access (cgh);
40
- cgh.parallel_for <Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
41
- [=](item<Kernel1Dim> i) {
42
- auto LID = i.get_linear_id ();
43
- accTmp[LID] = accIn[LID] + 5 ;
44
- });
47
+ cgh.parallel_for <Kernel1Name>(
48
+ R1, [=](item<1 > i) { accTmp[i] = accIn[i] + 5 ; });
45
49
});
46
50
47
51
q.submit ([&](handler &cgh) {
48
52
auto accTmp = bTmp.get_access (cgh);
49
53
auto accOut = bOut.get_access (cgh);
50
- cgh.parallel_for <Kernel2Name>(nd_range<1 >{{dataSize}, {8 }}, [=](id<1 > i) {
51
- accOut[i] = accTmp[i] * 2 ;
52
- });
54
+ cgh.parallel_for <Kernel2Name>(
55
+ R2, [=](id<1 > i) { accOut[i] = accTmp[i] * 2 ; });
53
56
});
54
57
55
58
fw.complete_fusion ({ext::codeplay::experimental::property::no_barriers{}});
@@ -60,7 +63,8 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,
60
63
61
64
// Check the results
62
65
size_t numErrors = 0 ;
63
- for (size_t i = 0 ; i < k1Global.size (); ++i) {
66
+ size_t size = getSize (R1);
67
+ for (size_t i = 0 ; i < size; ++i) {
64
68
if (out[i] != ((i + 5 ) * 2 )) {
65
69
++numErrors;
66
70
}
@@ -89,8 +93,9 @@ int main() {
89
93
90
94
// Scenario: Fusing two kernels with different local size should lead to
91
95
// fusion being aborted.
92
- performFusion<class Kernel1_3 , class Kernel2_3 >(q, range<1 >{dataSize},
93
- range<1 >{16 });
96
+ performFusion<class Kernel1_3 , class Kernel2_3 >(
97
+ q, nd_range<1 >{range<1 >{dataSize}, range<1 >{16 }},
98
+ nd_range<1 >{range<1 >{dataSize}, range<1 >{8 }});
94
99
// CHECK: ERROR: JIT compilation for kernel fusion failed with message:
95
100
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
96
101
// CHECK: COMPUTATION OK
@@ -101,5 +106,13 @@ int main() {
101
106
// CHECK-NOT: Cannot fuse kernels with different offsets or local sizes
102
107
// CHECK: WARNING: Fusion list is empty
103
108
109
+ // Scenario: Fusing two kernels that would lead to non-uniform work-group
110
+ // sizes should lead to fusion being aborted.
111
+ performFusion<class Kernel1_4 , class Kernel2_4 >(
112
+ q, nd_range<1 >{range<1 >{9 }, range<1 >{3 }}, range<1 >{dataSize});
113
+ // CHECK: ERROR: JIT compilation for kernel fusion failed with message:
114
+ // CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
115
+ // CHECK: COMPUTATION OK
116
+
104
117
return 0 ;
105
118
}
0 commit comments