Skip to content

Commit 96a24ae

Browse files
committed
Support multi-dimensional kernels.
Signed-off-by: Julian Oppermann <[email protected]>
1 parent f35a78d commit 96a24ae

File tree

5 files changed

+47
-22
lines changed

5 files changed

+47
-22
lines changed

sycl/doc/design/KernelFusionJIT.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,10 @@ transparent for fusion, meaning the generated wrapper kernel with the rounded up
307307
range will be used.
308308

309309
[Private internalization](#internalization-behavior) is supported when fusing
310-
such kernels. We use the original, unrounded global size when computing the
311-
private memory size. As range rounding only applies to basic kernels
312-
(parametrized by a `sycl::range`), local internalization is not affected by the
313-
range rounding transformation.
310+
such kernels. We use the original, unrounded global size in dimension 0 when
311+
computing the private memory size. As range rounding only applies to basic
312+
kernels (parametrized by a `sycl::range`), local internalization is not affected
313+
by the range rounding transformation.
314314

315315
### Unsupported SYCL constructs
316316

sycl/source/detail/jit_compiler.cpp

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,11 @@ static std::optional<size_t> getLocalSize(NDRDescT NDRange,
210210
"Unexpected range rounding");
211211
auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
212212
if (Target == Promotion::Private) {
213-
auto NumWorkItems = UserGlobalSize.value_or(NDRange.GlobalSize.size());
213+
if (UserGlobalSize.has_value()) {
214+
// Only the first dimension is affected by range rounding.
215+
NDRange.GlobalSize[0] = *UserGlobalSize;
216+
}
217+
auto NumWorkItems = NDRange.GlobalSize.size();
214218
// For private internalization, the local size is
215219
// (Number of elements in buffer)/(number of work-items)
216220
return NumElementsMem / NumWorkItems;
@@ -706,13 +710,18 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
706710
std::optional<size_t> UserGlobalSize;
707711
if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
708712
KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
709-
!Args.empty() &&
710-
Args[0].MType == kernel_param_kind_t::kind_std_layout && Args[0].MPtr &&
711-
Args[0].MSize == sizeof(size_t)) {
712-
size_t UGS = *reinterpret_cast<size_t *>(Args[0].MPtr);
713-
assert(KernelCG->MNDRDesc.Dims == 1 &&
714-
UGS < KernelCG->MNDRDesc.GlobalSize[0]);
715-
UserGlobalSize = UGS;
713+
!Args.empty()) {
714+
auto &A0 = Args[0];
715+
int Dims = KernelCG->MNDRDesc.Dims;
716+
if (A0.MPtr && A0.MSize == (Dims * sizeof(size_t)) &&
717+
A0.MType == kernel_param_kind_t::kind_std_layout) {
718+
size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
719+
// Range-rounding only applies to the first dimension.
720+
assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
721+
assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
722+
assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
723+
UserGlobalSize = UGS[0];
724+
}
716725
}
717726

718727
::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};

sycl/test-e2e/KernelFusion/different_nd_ranges.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// RUN: %{build} -o %t.out
2-
// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \
3+
// RUN: %{run} %t.out 2>&1 | FileCheck %s
34

45
// Test complete fusion of kernels with different ND-ranges.
56

@@ -264,5 +265,10 @@ int main() {
264265
RangeDesc{{10, 1, 1}, {5, 1, 1}}});
265266

266267
// Test global sizes that trigger the rounded range kernel insertion.
267-
test({RangeDesc{3000}, RangeDesc{7727}, RangeDesc{4096}});
268+
// Note that we lower the RR threshold when running this test.
269+
test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}});
270+
271+
// Test multi-dimensional range-rounded kernels. Only the first dimension will
272+
// be rounded up.
273+
test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}});
268274
}

sycl/test-e2e/KernelFusion/private_internalization.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
2-
// RUN: %{run} %t.out
2+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out
33

44
// Test complete fusion with private internalization specified on the
55
// accessors.
@@ -71,7 +71,8 @@ int main() {
7171
test<512>();
7272

7373
// Test prime size large enough to trigger rounded-range kernel insertion.
74-
test<7727>();
74+
// Note that we lower the RR threshold when running this test.
75+
test<523>();
7576

7677
return 0;
7778
}

sycl/test-e2e/KernelFusion/two_dimensional.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
2-
// RUN: %{run} %t.out
2+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out
33

44
// Test complete fusion with private internalization specified on the
55
// accessors for two-dimensional range.
@@ -8,9 +8,9 @@
88

99
using namespace sycl;
1010

11-
int main() {
12-
constexpr size_t sizeX = 16;
13-
constexpr size_t sizeY = 32;
11+
template <typename BaseName, size_t sizeX, size_t sizeY> class KernelName;
12+
13+
template <size_t sizeX, size_t sizeY> static void test() {
1414
constexpr size_t dataSize = sizeX * sizeY;
1515
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
1616

@@ -42,7 +42,7 @@ int main() {
4242
auto accIn2 = bIn2.get_access(cgh);
4343
auto accTmp = bTmp.get_access(
4444
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
45-
cgh.parallel_for<class KernelOne>(
45+
cgh.parallel_for<KernelName<class KernelOne, sizeX, sizeY>>(
4646
xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
4747
});
4848

@@ -51,7 +51,7 @@ int main() {
5151
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
5252
auto accIn3 = bIn3.get_access(cgh);
5353
auto accOut = bOut.get_access(cgh);
54-
cgh.parallel_for<class KernelTwo>(
54+
cgh.parallel_for<KernelName<class KernelTwo, sizeX, sizeY>>(
5555
xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
5656
});
5757

@@ -66,6 +66,15 @@ int main() {
6666
assert(out[i] == (20 * i * i) && "Computation error");
6767
assert(tmp[i] == -1 && "Not internalized");
6868
}
69+
}
70+
71+
int main() {
72+
// Test power-of-two size.
73+
test<16, 32>();
74+
75+
// Test prime sizes large enough to trigger rounded-range kernel insertion.
76+
// Note that we lower the RR threshold when running this test.
77+
test<67, 79>();
6978

7079
return 0;
7180
}

0 commit comments

Comments
 (0)