Skip to content

Commit f35a78d

Browse files
committed
[SYCL][Fusion] Enable fusion of rounded-range kernels
Signed-off-by: Julian Oppermann <[email protected]>
1 parent 9b33a43 commit f35a78d

File tree

4 files changed

+63
-17
lines changed

4 files changed

+63
-17
lines changed

sycl/doc/design/KernelFusionJIT.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and
300300
finalize the fused kernel to the final target. More information is available
301301
[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support).
302302

303+
### Interaction with `parallel_for` range rounding
304+
305+
DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is
306+
transparent for fusion, meaning the generated wrapper kernel with the rounded up
307+
range will be used.
308+
309+
[Private internalization](#internalization-behavior) is supported when fusing
310+
such kernels. We use the original, unrounded global size when computing the
311+
private memory size. As range rounding only applies to basic kernels
312+
(parametrized by a `sycl::range`), local internalization is not affected by the
313+
range rounding transformation.
314+
303315
### Unsupported SYCL constructs
304316

305317
The following SYCL API constructs are currently not officially supported for

sycl/source/detail/jit_compiler.cpp

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -203,11 +203,14 @@ static Promotion getInternalizationInfo(Requirement *Req) {
203203
return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
204204
}
205205

206-
static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
207-
Promotion Target) {
206+
static std::optional<size_t> getLocalSize(NDRDescT NDRange,
207+
std::optional<size_t> UserGlobalSize,
208+
Requirement *Req, Promotion Target) {
209+
assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
210+
"Unexpected range rounding");
208211
auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
209212
if (Target == Promotion::Private) {
210-
auto NumWorkItems = NDRange.GlobalSize.size();
213+
auto NumWorkItems = UserGlobalSize.value_or(NDRange.GlobalSize.size());
211214
// For private internalization, the local size is
212215
// (Number of elements in buffer)/(number of work-items)
213216
return NumElementsMem / NumWorkItems;
@@ -237,13 +240,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {
237240

238241
static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
239242
unsigned ArgFunctionIndex, NDRDescT NDRange,
243+
std::optional<size_t> UserGlobalSize,
240244
PromotionMap &Promotions) {
241245
assert(Arg.MType == kernel_param_kind_t::kind_accessor);
242246

243247
Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
244248

245249
auto ThisPromotionTarget = getInternalizationInfo(Req);
246-
auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
250+
auto ThisLocalSize =
251+
getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
247252

248253
if (Promotions.count(Req->MSYCLMemObj)) {
249254
// We previously encountered an accessor for the same buffer.
@@ -278,7 +283,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
278283
// Recompute the local size for the previous definition with adapted
279284
// promotion target.
280285
auto NewPrevLocalSize =
281-
getLocalSize(PreviousDefinition.NDRange,
286+
getLocalSize(PreviousDefinition.NDRange, std::nullopt,
282287
PreviousDefinition.Definition, Promotion::Local);
283288

284289
if (!NewPrevLocalSize.has_value()) {
@@ -316,7 +321,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
316321

317322
if (PreviousDefinition.PromotionTarget == Promotion::Local) {
318323
// Recompute the local size with adapted promotion target.
319-
auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
324+
auto ThisLocalSize =
325+
getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
320326
if (!ThisLocalSize.has_value()) {
321327
printPerformanceWarning("Work-group size for local promotion not "
322328
"specified, not performing internalization");
@@ -591,11 +597,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
591597
// argument is later on passed to the kernel.
592598
const size_t SizeAccField =
593599
sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
594-
// Compute the local size and use it for the range parameters.
595-
auto LocalSize = getLocalSize(NDRange, Req,
596-
(PromotedToPrivate) ? Promotion::Private
597-
: Promotion::Local);
598-
range<3> AccessRange{1, 1, LocalSize.value()};
600+
// Compute the local size and use it for the range parameters (only
601+
// relevant for local promotion).
602+
size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
603+
Req, Promotion::Local)
604+
: 0;
605+
range<3> AccessRange{1, 1, LocalSize};
599606
auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
600607
// Use all-zero as the offset
601608
id<3> AcessOffset{0, 0, 0};
@@ -604,7 +611,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
604611
// Override the arguments.
605612
// 1. Override the pointer with a std-layout argument with 'nullptr' as
606613
// value. handler.cpp does the same for local accessors.
607-
int SizeInBytes = Req->MElemSize * LocalSize.value();
614+
int SizeInBytes = Req->MElemSize * LocalSize;
608615
FusedArgs[ArgIndex] =
609616
ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
610617
static_cast<int>(ArgIndex)};
@@ -694,6 +701,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
694701
return A.MIndex < B.MIndex;
695702
});
696703

704+
// Determine whether the kernel has been subject to DPCPP's range rounding.
705+
// If so, the first argument will be the original ("user") range.
706+
std::optional<size_t> UserGlobalSize;
707+
if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
708+
KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
709+
!Args.empty() &&
710+
Args[0].MType == kernel_param_kind_t::kind_std_layout && Args[0].MPtr &&
711+
Args[0].MSize == sizeof(size_t)) {
712+
size_t UGS = *reinterpret_cast<size_t *>(Args[0].MPtr);
713+
assert(KernelCG->MNDRDesc.Dims == 1 &&
714+
UGS < KernelCG->MNDRDesc.GlobalSize[0]);
715+
UserGlobalSize = UGS;
716+
}
717+
697718
::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
698719
size_t ArgIndex = 0;
699720
// The kernel function in SPIR-V will only have the non-eliminated
@@ -719,7 +740,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
719740
if (!Eliminated) {
720741
if (Arg.MType == kernel_param_kind_t::kind_accessor) {
721742
resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
722-
KernelCG->MNDRDesc, PromotedAccs);
743+
KernelCG->MNDRDesc, UserGlobalSize,
744+
PromotedAccs);
723745
}
724746
FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
725747
++ArgFunctionIndex;

sycl/test-e2e/KernelFusion/different_nd_ranges.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,4 +262,7 @@ int main() {
262262
// 1-D, 2-D and 3-D kernels with different global sizes.
263263
test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
264264
RangeDesc{{10, 1, 1}, {5, 1, 1}}});
265+
266+
// Test global sizes that trigger the rounded range kernel insertion.
267+
test({RangeDesc{3000}, RangeDesc{7727}, RangeDesc{4096}});
265268
}

sycl/test-e2e/KernelFusion/private_internalization.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88

99
using namespace sycl;
1010

11-
int main() {
12-
constexpr size_t dataSize = 512;
11+
template <typename BaseName, size_t dataSize> class KernelName;
12+
13+
template <size_t dataSize> static void test() {
1314
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
1415

1516
for (size_t i = 0; i < dataSize; ++i) {
@@ -39,7 +40,7 @@ int main() {
3940
auto accIn2 = bIn2.get_access(cgh);
4041
auto accTmp = bTmp.get_access(
4142
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
42-
cgh.parallel_for<class KernelOne>(
43+
cgh.parallel_for<KernelName<class KernelOne, dataSize>>(
4344
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
4445
});
4546

@@ -48,7 +49,7 @@ int main() {
4849
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
4950
auto accIn3 = bIn3.get_access(cgh);
5051
auto accOut = bOut.get_access(cgh);
51-
cgh.parallel_for<class KernelTwo>(
52+
cgh.parallel_for<KernelName<class KernelTwo, dataSize>>(
5253
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
5354
});
5455

@@ -63,6 +64,14 @@ int main() {
6364
assert(out[i] == (20 * i * i) && "Computation error");
6465
assert(tmp[i] == -1 && "Not internalized");
6566
}
67+
}
68+
69+
int main() {
70+
// Test power-of-two size.
71+
test<512>();
72+
73+
// Test prime size large enough to trigger rounded-range kernel insertion.
74+
test<7727>();
6675

6776
return 0;
6877
}

0 commit comments

Comments
 (0)