@@ -203,11 +203,14 @@ static Promotion getInternalizationInfo(Requirement *Req) {
203
203
return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
204
204
}
205
205
206
- static std::optional<size_t > getLocalSize (NDRDescT NDRange, Requirement *Req,
207
- Promotion Target) {
206
+ static std::optional<size_t > getLocalSize (NDRDescT NDRange,
207
+ std::optional<size_t > UserGlobalSize,
208
+ Requirement *Req, Promotion Target) {
209
+ assert ((!UserGlobalSize.has_value () || Target != Promotion::Local) &&
210
+ " Unexpected range rounding" );
208
211
auto NumElementsMem = static_cast <SYCLMemObjT *>(Req->MSYCLMemObj )->size ();
209
212
if (Target == Promotion::Private) {
210
- auto NumWorkItems = NDRange.GlobalSize .size ();
213
+ auto NumWorkItems = UserGlobalSize. value_or ( NDRange.GlobalSize .size () );
211
214
// For private internalization, the local size is
212
215
// (Number of elements in buffer)/(number of work-items)
213
216
return NumElementsMem / NumWorkItems;
@@ -237,13 +240,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {
237
240
238
241
static void resolveInternalization (ArgDesc &Arg, unsigned KernelIndex,
239
242
unsigned ArgFunctionIndex, NDRDescT NDRange,
243
+ std::optional<size_t > UserGlobalSize,
240
244
PromotionMap &Promotions) {
241
245
assert (Arg.MType == kernel_param_kind_t ::kind_accessor);
242
246
243
247
Requirement *Req = static_cast <Requirement *>(Arg.MPtr );
244
248
245
249
auto ThisPromotionTarget = getInternalizationInfo (Req);
246
- auto ThisLocalSize = getLocalSize (NDRange, Req, ThisPromotionTarget);
250
+ auto ThisLocalSize =
251
+ getLocalSize (NDRange, UserGlobalSize, Req, ThisPromotionTarget);
247
252
248
253
if (Promotions.count (Req->MSYCLMemObj )) {
249
254
// We previously encountered an accessor for the same buffer.
@@ -278,7 +283,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
278
283
// Recompute the local size for the previous definition with adapted
279
284
// promotion target.
280
285
auto NewPrevLocalSize =
281
- getLocalSize (PreviousDefinition.NDRange ,
286
+ getLocalSize (PreviousDefinition.NDRange , std::nullopt,
282
287
PreviousDefinition.Definition , Promotion::Local);
283
288
284
289
if (!NewPrevLocalSize.has_value ()) {
@@ -316,7 +321,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
316
321
317
322
if (PreviousDefinition.PromotionTarget == Promotion::Local) {
318
323
// Recompute the local size with adapted promotion target.
319
- auto ThisLocalSize = getLocalSize (NDRange, Req, Promotion::Local);
324
+ auto ThisLocalSize =
325
+ getLocalSize (NDRange, std::nullopt, Req, Promotion::Local);
320
326
if (!ThisLocalSize.has_value ()) {
321
327
printPerformanceWarning (" Work-group size for local promotion not "
322
328
" specified, not performing internalization" );
@@ -591,11 +597,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
591
597
// argument is later on passed to the kernel.
592
598
const size_t SizeAccField =
593
599
sizeof (size_t ) * (Req->MDims == 0 ? 1 : Req->MDims );
594
- // Compute the local size and use it for the range parameters.
595
- auto LocalSize = getLocalSize (NDRange, Req,
596
- (PromotedToPrivate) ? Promotion::Private
597
- : Promotion::Local);
598
- range<3 > AccessRange{1 , 1 , LocalSize.value ()};
600
+ // Compute the local size and use it for the range parameters (only
601
+ // relevant for local promotion).
602
+ size_t LocalSize = PromotedToLocal ? *getLocalSize (NDRange, std::nullopt,
603
+ Req, Promotion::Local)
604
+ : 0 ;
605
+ range<3 > AccessRange{1 , 1 , LocalSize};
599
606
auto *RangeArg = storePlainArg (FusedArgStorage, AccessRange);
600
607
// Use all-zero as the offset
601
608
id<3 > AcessOffset{0 , 0 , 0 };
@@ -604,7 +611,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
604
611
// Override the arguments.
605
612
// 1. Override the pointer with a std-layout argument with 'nullptr' as
606
613
// value. handler.cpp does the same for local accessors.
607
- int SizeInBytes = Req->MElemSize * LocalSize. value () ;
614
+ int SizeInBytes = Req->MElemSize * LocalSize;
608
615
FusedArgs[ArgIndex] =
609
616
ArgDesc{kernel_param_kind_t ::kind_std_layout, nullptr , SizeInBytes,
610
617
static_cast <int >(ArgIndex)};
@@ -694,6 +701,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
694
701
return A.MIndex < B.MIndex ;
695
702
});
696
703
704
+ // Determine whether the kernel has been subject to DPCPP's range rounding.
705
+ // If so, the first argument will be the original ("user") range.
706
+ std::optional<size_t > UserGlobalSize;
707
+ if ((KernelName.find (" _ZTSN4sycl3_V16detail18RoundedRangeKernel" ) == 0 ||
708
+ KernelName.find (" _ZTSN4sycl3_V16detail19__pf_kernel_wrapper" ) == 0 ) &&
709
+ !Args.empty () &&
710
+ Args[0 ].MType == kernel_param_kind_t ::kind_std_layout && Args[0 ].MPtr &&
711
+ Args[0 ].MSize == sizeof (size_t )) {
712
+ size_t UGS = *reinterpret_cast <size_t *>(Args[0 ].MPtr );
713
+ assert (KernelCG->MNDRDesc .Dims == 1 &&
714
+ UGS < KernelCG->MNDRDesc .GlobalSize [0 ]);
715
+ UserGlobalSize = UGS;
716
+ }
717
+
697
718
::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size ()};
698
719
size_t ArgIndex = 0 ;
699
720
// The kernel function in SPIR-V will only have the non-eliminated
@@ -719,7 +740,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
719
740
if (!Eliminated) {
720
741
if (Arg.MType == kernel_param_kind_t ::kind_accessor) {
721
742
resolveInternalization (Arg, KernelIndex, ArgFunctionIndex,
722
- KernelCG->MNDRDesc , PromotedAccs);
743
+ KernelCG->MNDRDesc , UserGlobalSize,
744
+ PromotedAccs);
723
745
}
724
746
FusedParams.emplace_back (Arg, KernelIndex, ArgFunctionIndex, true );
725
747
++ArgFunctionIndex;
0 commit comments