[SYCL][Fusion] Enable fusion of rounded-range kernels (#12492)

jopperm · web-flow · commit a3e231587ceb · 2024-01-31T08:41:37.000+01:00
Enable, test, and document the support for fusing rounded range kernels.
This mostly worked already – we just have to query the original kernel's
global size, and use that to compute the private memory size used for
internalization.

---------

Signed-off-by: Julian Oppermann &lt;julian.oppermann@codeplay.com&gt;
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
@@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and
 finalize the fused kernel to the final target. More information is available
 [here](./CompilerAndRuntimeDesign.md#kernel-fusion-support).
 
+### Interaction with `parallel_for` range rounding
+
+DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is
+transparent for fusion, meaning the generated wrapper kernel with the rounded up
+range will be used.
+
+[Private internalization](#internalization-behavior) is supported when fusing
+such kernels. We use the original, unrounded global size in dimension 0 when
+computing the private memory size. As range rounding only applies to basic
+kernels (parametrized by a `sycl::range`), local internalization is not affected
+by the range rounding transformation.
+
 ### Unsupported SYCL constructs
 
 The following SYCL API constructs are currently not officially supported for
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
@@ -203,10 +203,17 @@ static Promotion getInternalizationInfo(Requirement *Req) {
   return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
 }
 
-static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
-                                          Promotion Target) {
+static std::optional<size_t> getLocalSize(NDRDescT NDRange,
+                                          std::optional<size_t> UserGlobalSize,
+                                          Requirement *Req, Promotion Target) {
+  assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
+         "Unexpected range rounding");
   auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
   if (Target == Promotion::Private) {
+    if (UserGlobalSize.has_value()) {
+      // Only the first dimension is affected by range rounding.
+      NDRange.GlobalSize[0] = *UserGlobalSize;
+    }
     auto NumWorkItems = NDRange.GlobalSize.size();
     // For private internalization, the local size is
     // (Number of elements in buffer)/(number of work-items)
@@ -237,13 +244,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {
 
 static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
                                    unsigned ArgFunctionIndex, NDRDescT NDRange,
+                                   std::optional<size_t> UserGlobalSize,
                                    PromotionMap &Promotions) {
   assert(Arg.MType == kernel_param_kind_t::kind_accessor);
 
   Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
 
   auto ThisPromotionTarget = getInternalizationInfo(Req);
-  auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
+  auto ThisLocalSize =
+      getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
 
   if (Promotions.count(Req->MSYCLMemObj)) {
     // We previously encountered an accessor for the same buffer.
@@ -278,7 +287,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
         // Recompute the local size for the previous definition with adapted
         // promotion target.
         auto NewPrevLocalSize =
-            getLocalSize(PreviousDefinition.NDRange,
+            getLocalSize(PreviousDefinition.NDRange, std::nullopt,
                          PreviousDefinition.Definition, Promotion::Local);
 
         if (!NewPrevLocalSize.has_value()) {
@@ -316,7 +325,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
 
       if (PreviousDefinition.PromotionTarget == Promotion::Local) {
         // Recompute the local size with adapted promotion target.
-        auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
+        auto ThisLocalSize =
+            getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
         if (!ThisLocalSize.has_value()) {
           printPerformanceWarning("Work-group size for local promotion not "
                                   "specified, not performing internalization");
@@ -591,11 +601,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
       // argument is later on passed to the kernel.
       const size_t SizeAccField =
           sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
-      // Compute the local size and use it for the range parameters.
-      auto LocalSize = getLocalSize(NDRange, Req,
-                                    (PromotedToPrivate) ? Promotion::Private
-                                                        : Promotion::Local);
-      range<3> AccessRange{1, 1, LocalSize.value()};
+      // Compute the local size and use it for the range parameters (only
+      // relevant for local promotion).
+      size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
+                                                         Req, Promotion::Local)
+                                         : 0;
+      range<3> AccessRange{1, 1, LocalSize};
       auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
       // Use all-zero as the offset
       id<3> AcessOffset{0, 0, 0};
@@ -604,7 +615,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
       // Override the arguments.
       // 1. Override the pointer with a std-layout argument with 'nullptr' as
       // value. handler.cpp does the same for local accessors.
-      int SizeInBytes = Req->MElemSize * LocalSize.value();
+      int SizeInBytes = Req->MElemSize * LocalSize;
       FusedArgs[ArgIndex] =
           ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
                   static_cast<int>(ArgIndex)};
@@ -694,6 +705,26 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return A.MIndex < B.MIndex;
     });
 
+    // Determine whether the kernel has been subject to DPCPP's range rounding.
+    // If so, the first argument will be the original ("user") range.
+    std::optional<size_t> UserGlobalSize;
+    if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
+         KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
+        !Args.empty()) {
+      auto &A0 = Args[0];
+      auto Dims = KernelCG->MNDRDesc.Dims;
+      assert(A0.MPtr && A0.MSize == static_cast<int>(Dims * sizeof(size_t)) &&
+             A0.MType == kernel_param_kind_t::kind_std_layout &&
+             "Unexpected signature for rounded range kernel");
+
+      size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
+      // Range-rounding only applies to the first dimension.
+      assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
+      assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
+      assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
+      UserGlobalSize = UGS[0];
+    }
+
     ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
     size_t ArgIndex = 0;
     // The kernel function in SPIR-V will only have the non-eliminated
@@ -719,7 +750,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       if (!Eliminated) {
         if (Arg.MType == kernel_param_kind_t::kind_accessor) {
           resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
-                                 KernelCG->MNDRDesc, PromotedAccs);
+                                 KernelCG->MNDRDesc, UserGlobalSize,
+                                 PromotedAccs);
         }
         FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
         ++ArgFunctionIndex;
diff --git a/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp b/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp
@@ -1,5 +1,6 @@
 // RUN: %{build} -o %t.out
-// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \
+// RUN:   %{run} %t.out 2>&1 | FileCheck %s
 
 // Test complete fusion of kernels with different ND-ranges.
 
@@ -262,4 +263,12 @@ int main() {
   // 1-D, 2-D and 3-D kernels with different global sizes.
   test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
         RangeDesc{{10, 1, 1}, {5, 1, 1}}});
+
+  // Test global sizes that trigger the rounded range kernel insertion.
+  // Note that we lower the RR threshold when running this test.
+  test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}});
+
+  // Test multi-dimensional range-rounded kernels. Only the first dimension will
+  // be rounded up.
+  test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}});
 }
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
-// RUN: %{run} %t.out
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
 // accessors.
@@ -8,8 +8,9 @@
 
 using namespace sycl;
 
-int main() {
-  constexpr size_t dataSize = 512;
+template <typename BaseName, size_t dataSize> class KernelName;
+
+template <size_t dataSize> static void test() {
   int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
 
   for (size_t i = 0; i < dataSize; ++i) {
@@ -39,7 +40,7 @@ int main() {
       auto accIn2 = bIn2.get_access(cgh);
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
-      cgh.parallel_for<class KernelOne>(
+      cgh.parallel_for<KernelName<class KernelOne, dataSize>>(
           dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
     });
 
@@ -48,7 +49,7 @@ int main() {
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
+      cgh.parallel_for<KernelName<class KernelTwo, dataSize>>(
           dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
     });
 
@@ -63,6 +64,15 @@ int main() {
     assert(out[i] == (20 * i * i) && "Computation error");
     assert(tmp[i] == -1 && "Not internalized");
   }
+}
+
+int main() {
+  // Test power-of-two size.
+  test<512>();
+
+  // Test prime size large enough to trigger rounded-range kernel insertion.
+  // Note that we lower the RR threshold when running this test.
+  test<523>();
 
   return 0;
 }
diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
-// RUN: %{run} %t.out
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
 // accessors for two-dimensional range.
@@ -8,9 +8,9 @@
 
 using namespace sycl;
 
-int main() {
-  constexpr size_t sizeX = 16;
-  constexpr size_t sizeY = 32;
+template <typename BaseName, size_t sizeX, size_t sizeY> class KernelName;
+
+template <size_t sizeX, size_t sizeY> static void test() {
   constexpr size_t dataSize = sizeX * sizeY;
   int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
 
@@ -42,7 +42,7 @@ int main() {
       auto accIn2 = bIn2.get_access(cgh);
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
-      cgh.parallel_for<class KernelOne>(
+      cgh.parallel_for<KernelName<class KernelOne, sizeX, sizeY>>(
           xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
     });
 
@@ -51,7 +51,7 @@ int main() {
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
+      cgh.parallel_for<KernelName<class KernelTwo, sizeX, sizeY>>(
           xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
     });
 
@@ -66,6 +66,15 @@ int main() {
     assert(out[i] == (20 * i * i) && "Computation error");
     assert(tmp[i] == -1 && "Not internalized");
   }
+}
+
+int main() {
+  // Test power-of-two size.
+  test<16, 32>();
+
+  // Test prime sizes large enough to trigger rounded-range kernel insertion.
+  // Note that we lower the RR threshold when running this test.
+  test<67, 79>();
 
   return 0;
 }