[SYCL][Fusion] Enable fusion of rounded-range kernels

jopperm · jopperm · commit f35a78d1dc10 · 2024-01-30T09:50:31.000Z
Signed-off-by: Julian Oppermann <julian.oppermann@codeplay.com>
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
@@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and
 finalize the fused kernel to the final target. More information is available
 [here](./CompilerAndRuntimeDesign.md#kernel-fusion-support).
 
+### Interaction with `parallel_for` range rounding
+
+DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is
+transparent to fusion, meaning that the generated wrapper kernel with the
+rounded-up range will be used.
+
+[Private internalization](#internalization-behavior) is supported when fusing
+such kernels. We use the original, unrounded global size when computing the
+private memory size. As range rounding only applies to basic kernels
+(parametrized by a `sycl::range`), local internalization is not affected by the
+range rounding transformation.
+
 ### Unsupported SYCL constructs
 
 The following SYCL API constructs are currently not officially supported for
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
@@ -203,11 +203,14 @@ static Promotion getInternalizationInfo(Requirement *Req) {
   return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
 }
 
-static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
-                                          Promotion Target) {
+static std::optional<size_t> getLocalSize(NDRDescT NDRange,
+                                          std::optional<size_t> UserGlobalSize,
+                                          Requirement *Req, Promotion Target) {
+  assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
+         "Unexpected range rounding");
   auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
   if (Target == Promotion::Private) {
-    auto NumWorkItems = NDRange.GlobalSize.size();
+    auto NumWorkItems = UserGlobalSize.value_or(NDRange.GlobalSize.size());
     // For private internalization, the local size is
     // (Number of elements in buffer)/(number of work-items)
     return NumElementsMem / NumWorkItems;
@@ -237,13 +240,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {
 
 static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
                                    unsigned ArgFunctionIndex, NDRDescT NDRange,
+                                   std::optional<size_t> UserGlobalSize,
                                    PromotionMap &Promotions) {
   assert(Arg.MType == kernel_param_kind_t::kind_accessor);
 
   Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
 
   auto ThisPromotionTarget = getInternalizationInfo(Req);
-  auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
+  auto ThisLocalSize =
+      getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
 
   if (Promotions.count(Req->MSYCLMemObj)) {
     // We previously encountered an accessor for the same buffer.
@@ -278,7 +283,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
         // Recompute the local size for the previous definition with adapted
         // promotion target.
         auto NewPrevLocalSize =
-            getLocalSize(PreviousDefinition.NDRange,
+            getLocalSize(PreviousDefinition.NDRange, std::nullopt,
                          PreviousDefinition.Definition, Promotion::Local);
 
         if (!NewPrevLocalSize.has_value()) {
@@ -316,7 +321,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
 
       if (PreviousDefinition.PromotionTarget == Promotion::Local) {
         // Recompute the local size with adapted promotion target.
-        auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
+        auto ThisLocalSize =
+            getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
         if (!ThisLocalSize.has_value()) {
           printPerformanceWarning("Work-group size for local promotion not "
                                   "specified, not performing internalization");
@@ -591,11 +597,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
       // argument is later on passed to the kernel.
       const size_t SizeAccField =
           sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
-      // Compute the local size and use it for the range parameters.
-      auto LocalSize = getLocalSize(NDRange, Req,
-                                    (PromotedToPrivate) ? Promotion::Private
-                                                        : Promotion::Local);
-      range<3> AccessRange{1, 1, LocalSize.value()};
+      // Compute the local size and use it for the range parameters (only
+      // relevant for local promotion).
+      size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
+                                                         Req, Promotion::Local)
+                                         : 0;
+      range<3> AccessRange{1, 1, LocalSize};
       auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
       // Use all-zero as the offset
       id<3> AcessOffset{0, 0, 0};
@@ -604,7 +611,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
       // Override the arguments.
       // 1. Override the pointer with a std-layout argument with 'nullptr' as
       // value. handler.cpp does the same for local accessors.
-      int SizeInBytes = Req->MElemSize * LocalSize.value();
+      int SizeInBytes = Req->MElemSize * LocalSize;
       FusedArgs[ArgIndex] =
           ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
                   static_cast<int>(ArgIndex)};
@@ -694,6 +701,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return A.MIndex < B.MIndex;
     });
 
+    // Determine whether the kernel has been subject to DPCPP's range rounding.
+    // If so, the first argument will be the original ("user") range.
+    std::optional<size_t> UserGlobalSize;
+    if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
+         KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
+        !Args.empty() &&
+        Args[0].MType == kernel_param_kind_t::kind_std_layout && Args[0].MPtr &&
+        Args[0].MSize == sizeof(size_t)) {
+      size_t UGS = *reinterpret_cast<size_t *>(Args[0].MPtr);
+      assert(KernelCG->MNDRDesc.Dims == 1 &&
+             UGS < KernelCG->MNDRDesc.GlobalSize[0]);
+      UserGlobalSize = UGS;
+    }
+
     ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
     size_t ArgIndex = 0;
     // The kernel function in SPIR-V will only have the non-eliminated
@@ -719,7 +740,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       if (!Eliminated) {
         if (Arg.MType == kernel_param_kind_t::kind_accessor) {
           resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
-                                 KernelCG->MNDRDesc, PromotedAccs);
+                                 KernelCG->MNDRDesc, UserGlobalSize,
+                                 PromotedAccs);
         }
         FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
         ++ArgFunctionIndex;
diff --git a/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp b/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp
@@ -262,4 +262,7 @@ int main() {
   // 1-D, 2-D and 3-D kernels with different global sizes.
   test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
         RangeDesc{{10, 1, 1}, {5, 1, 1}}});
+
+  // Test global sizes that trigger the rounded range kernel insertion.
+  test({RangeDesc{3000}, RangeDesc{7727}, RangeDesc{4096}});
 }
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -8,8 +8,9 @@
 
 using namespace sycl;
 
-int main() {
-  constexpr size_t dataSize = 512;
+template <typename BaseName, size_t dataSize> class KernelName;
+
+template <size_t dataSize> static void test() {
   int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
 
   for (size_t i = 0; i < dataSize; ++i) {
@@ -39,7 +40,7 @@ int main() {
       auto accIn2 = bIn2.get_access(cgh);
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
-      cgh.parallel_for<class KernelOne>(
+      cgh.parallel_for<KernelName<class KernelOne, dataSize>>(
           dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
     });
 
@@ -48,7 +49,7 @@ int main() {
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
+      cgh.parallel_for<KernelName<class KernelTwo, dataSize>>(
           dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
     });
 
@@ -63,6 +64,14 @@ int main() {
     assert(out[i] == (20 * i * i) && "Computation error");
     assert(tmp[i] == -1 && "Not internalized");
   }
+}
+
+int main() {
+  // Test power-of-two size.
+  test<512>();
+
+  // Test prime size large enough to trigger rounded-range kernel insertion.
+  test<7727>();
 
   return 0;
 }

Original file line number	Diff line number	Diff line change
`@@ -262,4 +262,7 @@ int main() {`
`262`	`262`	`// 1-D, 2-D and 3-D kernels with different global sizes.`
`263`	`263`	`test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},`
`264`	`264`	`RangeDesc{{10, 1, 1}, {5, 1, 1}}});`
	`265`	`+`
	`266`	`+ // Test global sizes that trigger the rounded range kernel insertion.`
	`267`	`+ test({RangeDesc{3000}, RangeDesc{7727}, RangeDesc{4096}});`
`265`	`268`	`}`