Skip to content

Commit a3e2315

Browse files
authored
[SYCL][Fusion] Enable fusion of rounded-range kernels (#12492)
Enable, test, and document the support for fusing rounded range kernels. This mostly worked already – we just have to query the original kernel's global size, and use that to compute the private memory size used for internalization. --------- Signed-off-by: Julian Oppermann <[email protected]>
1 parent 9741bdc commit a3e2315

File tree

5 files changed

+96
-24
lines changed

5 files changed

+96
-24
lines changed

sycl/doc/design/KernelFusionJIT.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and
300300
finalize the fused kernel to the final target. More information is available
301301
[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support).
302302

303+
### Interaction with `parallel_for` range rounding
304+
305+
DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is
306+
transparent for fusion, meaning the generated wrapper kernel with the rounded up
307+
range will be used.
308+
309+
[Private internalization](#internalization-behavior) is supported when fusing
310+
such kernels. We use the original, unrounded global size in dimension 0 when
311+
computing the private memory size. As range rounding only applies to basic
312+
kernels (parametrized by a `sycl::range`), local internalization is not affected
313+
by the range rounding transformation.
314+
303315
### Unsupported SYCL constructs
304316

305317
The following SYCL API constructs are currently not officially supported for

sycl/source/detail/jit_compiler.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,17 @@ static Promotion getInternalizationInfo(Requirement *Req) {
203203
return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
204204
}
205205

206-
static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
207-
Promotion Target) {
206+
static std::optional<size_t> getLocalSize(NDRDescT NDRange,
207+
std::optional<size_t> UserGlobalSize,
208+
Requirement *Req, Promotion Target) {
209+
assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
210+
"Unexpected range rounding");
208211
auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
209212
if (Target == Promotion::Private) {
213+
if (UserGlobalSize.has_value()) {
214+
// Only the first dimension is affected by range rounding.
215+
NDRange.GlobalSize[0] = *UserGlobalSize;
216+
}
210217
auto NumWorkItems = NDRange.GlobalSize.size();
211218
// For private internalization, the local size is
212219
// (Number of elements in buffer)/(number of work-items)
@@ -237,13 +244,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {
237244

238245
static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
239246
unsigned ArgFunctionIndex, NDRDescT NDRange,
247+
std::optional<size_t> UserGlobalSize,
240248
PromotionMap &Promotions) {
241249
assert(Arg.MType == kernel_param_kind_t::kind_accessor);
242250

243251
Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
244252

245253
auto ThisPromotionTarget = getInternalizationInfo(Req);
246-
auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
254+
auto ThisLocalSize =
255+
getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
247256

248257
if (Promotions.count(Req->MSYCLMemObj)) {
249258
// We previously encountered an accessor for the same buffer.
@@ -278,7 +287,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
278287
// Recompute the local size for the previous definition with adapted
279288
// promotion target.
280289
auto NewPrevLocalSize =
281-
getLocalSize(PreviousDefinition.NDRange,
290+
getLocalSize(PreviousDefinition.NDRange, std::nullopt,
282291
PreviousDefinition.Definition, Promotion::Local);
283292

284293
if (!NewPrevLocalSize.has_value()) {
@@ -316,7 +325,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
316325

317326
if (PreviousDefinition.PromotionTarget == Promotion::Local) {
318327
// Recompute the local size with adapted promotion target.
319-
auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
328+
auto ThisLocalSize =
329+
getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
320330
if (!ThisLocalSize.has_value()) {
321331
printPerformanceWarning("Work-group size for local promotion not "
322332
"specified, not performing internalization");
@@ -591,11 +601,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
591601
// argument is later on passed to the kernel.
592602
const size_t SizeAccField =
593603
sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
594-
// Compute the local size and use it for the range parameters.
595-
auto LocalSize = getLocalSize(NDRange, Req,
596-
(PromotedToPrivate) ? Promotion::Private
597-
: Promotion::Local);
598-
range<3> AccessRange{1, 1, LocalSize.value()};
604+
// Compute the local size and use it for the range parameters (only
605+
// relevant for local promotion).
606+
size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
607+
Req, Promotion::Local)
608+
: 0;
609+
range<3> AccessRange{1, 1, LocalSize};
599610
auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
600611
// Use all-zero as the offset
601612
id<3> AcessOffset{0, 0, 0};
@@ -604,7 +615,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
604615
// Override the arguments.
605616
// 1. Override the pointer with a std-layout argument with 'nullptr' as
606617
// value. handler.cpp does the same for local accessors.
607-
int SizeInBytes = Req->MElemSize * LocalSize.value();
618+
int SizeInBytes = Req->MElemSize * LocalSize;
608619
FusedArgs[ArgIndex] =
609620
ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
610621
static_cast<int>(ArgIndex)};
@@ -694,6 +705,26 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
694705
return A.MIndex < B.MIndex;
695706
});
696707

708+
// Determine whether the kernel has been subject to DPCPP's range rounding.
709+
// If so, the first argument will be the original ("user") range.
710+
std::optional<size_t> UserGlobalSize;
711+
if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
712+
KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
713+
!Args.empty()) {
714+
auto &A0 = Args[0];
715+
auto Dims = KernelCG->MNDRDesc.Dims;
716+
assert(A0.MPtr && A0.MSize == static_cast<int>(Dims * sizeof(size_t)) &&
717+
A0.MType == kernel_param_kind_t::kind_std_layout &&
718+
"Unexpected signature for rounded range kernel");
719+
720+
size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
721+
// Range-rounding only applies to the first dimension.
722+
assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
723+
assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
724+
assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
725+
UserGlobalSize = UGS[0];
726+
}
727+
697728
::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
698729
size_t ArgIndex = 0;
699730
// The kernel function in SPIR-V will only have the non-eliminated
@@ -719,7 +750,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
719750
if (!Eliminated) {
720751
if (Arg.MType == kernel_param_kind_t::kind_accessor) {
721752
resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
722-
KernelCG->MNDRDesc, PromotedAccs);
753+
KernelCG->MNDRDesc, UserGlobalSize,
754+
PromotedAccs);
723755
}
724756
FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
725757
++ArgFunctionIndex;

sycl/test-e2e/KernelFusion/different_nd_ranges.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// RUN: %{build} -o %t.out
2-
// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \
3+
// RUN: %{run} %t.out 2>&1 | FileCheck %s
34

45
// Test complete fusion of kernels with different ND-ranges.
56

@@ -262,4 +263,12 @@ int main() {
262263
// 1-D, 2-D and 3-D kernels with different global sizes.
263264
test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
264265
RangeDesc{{10, 1, 1}, {5, 1, 1}}});
266+
267+
// Test global sizes that trigger the rounded range kernel insertion.
268+
// Note that we lower the RR threshold when running this test.
269+
test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}});
270+
271+
// Test multi-dimensional range-rounded kernels. Only the first dimension will
272+
// be rounded up.
273+
test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}});
265274
}

sycl/test-e2e/KernelFusion/private_internalization.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
2-
// RUN: %{run} %t.out
2+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out
33

44
// Test complete fusion with private internalization specified on the
55
// accessors.
@@ -8,8 +8,9 @@
88

99
using namespace sycl;
1010

11-
int main() {
12-
constexpr size_t dataSize = 512;
11+
template <typename BaseName, size_t dataSize> class KernelName;
12+
13+
template <size_t dataSize> static void test() {
1314
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
1415

1516
for (size_t i = 0; i < dataSize; ++i) {
@@ -39,7 +40,7 @@ int main() {
3940
auto accIn2 = bIn2.get_access(cgh);
4041
auto accTmp = bTmp.get_access(
4142
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
42-
cgh.parallel_for<class KernelOne>(
43+
cgh.parallel_for<KernelName<class KernelOne, dataSize>>(
4344
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
4445
});
4546

@@ -48,7 +49,7 @@ int main() {
4849
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
4950
auto accIn3 = bIn3.get_access(cgh);
5051
auto accOut = bOut.get_access(cgh);
51-
cgh.parallel_for<class KernelTwo>(
52+
cgh.parallel_for<KernelName<class KernelTwo, dataSize>>(
5253
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
5354
});
5455

@@ -63,6 +64,15 @@ int main() {
6364
assert(out[i] == (20 * i * i) && "Computation error");
6465
assert(tmp[i] == -1 && "Not internalized");
6566
}
67+
}
68+
69+
int main() {
70+
// Test power-of-two size.
71+
test<512>();
72+
73+
// Test prime size large enough to trigger rounded-range kernel insertion.
74+
// Note that we lower the RR threshold when running this test.
75+
test<523>();
6676

6777
return 0;
6878
}

sycl/test-e2e/KernelFusion/two_dimensional.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
2-
// RUN: %{run} %t.out
2+
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out
33

44
// Test complete fusion with private internalization specified on the
55
// accessors for two-dimensional range.
@@ -8,9 +8,9 @@
88

99
using namespace sycl;
1010

11-
int main() {
12-
constexpr size_t sizeX = 16;
13-
constexpr size_t sizeY = 32;
11+
template <typename BaseName, size_t sizeX, size_t sizeY> class KernelName;
12+
13+
template <size_t sizeX, size_t sizeY> static void test() {
1414
constexpr size_t dataSize = sizeX * sizeY;
1515
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
1616

@@ -42,7 +42,7 @@ int main() {
4242
auto accIn2 = bIn2.get_access(cgh);
4343
auto accTmp = bTmp.get_access(
4444
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
45-
cgh.parallel_for<class KernelOne>(
45+
cgh.parallel_for<KernelName<class KernelOne, sizeX, sizeY>>(
4646
xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
4747
});
4848

@@ -51,7 +51,7 @@ int main() {
5151
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
5252
auto accIn3 = bIn3.get_access(cgh);
5353
auto accOut = bOut.get_access(cgh);
54-
cgh.parallel_for<class KernelTwo>(
54+
cgh.parallel_for<KernelName<class KernelTwo, sizeX, sizeY>>(
5555
xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
5656
});
5757

@@ -66,6 +66,15 @@ int main() {
6666
assert(out[i] == (20 * i * i) && "Computation error");
6767
assert(tmp[i] == -1 && "Not internalized");
6868
}
69+
}
70+
71+
int main() {
72+
// Test power-of-two size.
73+
test<16, 32>();
74+
75+
// Test prime sizes large enough to trigger rounded-range kernel insertion.
76+
// Note that we lower the RR threshold when running this test.
77+
test<67, 79>();
6978

7079
return 0;
7180
}

0 commit comments

Comments
 (0)