Support multi-dimensional kernels.

jopperm · jopperm · commit 96a24aef9d06 · 2024-01-30T09:50:31.000Z
Signed-off-by: Julian Oppermann <julian.oppermann@codeplay.com>
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
@@ -307,10 +307,10 @@ transparent for fusion, meaning the generated wrapper kernel with the rounded up
 range will be used.
 
 [Private internalization](#internalization-behavior) is supported when fusing
-such kernels. We use the original, unrounded global size when computing the
-private memory size. As range rounding only applies to basic kernels
-(parametrized by a `sycl::range`), local internalization is not affected by the
-range rounding transformation.
+such kernels. We use the original, unrounded global size in dimension 0 when
+computing the private memory size. As range rounding only applies to basic
+kernels (parametrized by a `sycl::range`), local internalization is not affected
+by the range rounding transformation.
 
 ### Unsupported SYCL constructs
 
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
@@ -210,7 +210,11 @@ static std::optional<size_t> getLocalSize(NDRDescT NDRange,
          "Unexpected range rounding");
   auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
   if (Target == Promotion::Private) {
-    auto NumWorkItems = UserGlobalSize.value_or(NDRange.GlobalSize.size());
+    if (UserGlobalSize.has_value()) {
+      // Only the first dimension is affected by range rounding.
+      NDRange.GlobalSize[0] = *UserGlobalSize;
+    }
+    auto NumWorkItems = NDRange.GlobalSize.size();
     // For private internalization, the local size is
     // (Number of elements in buffer)/(number of work-items)
     return NumElementsMem / NumWorkItems;
@@ -706,13 +710,18 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
     std::optional<size_t> UserGlobalSize;
     if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
          KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
-        !Args.empty() &&
-        Args[0].MType == kernel_param_kind_t::kind_std_layout && Args[0].MPtr &&
-        Args[0].MSize == sizeof(size_t)) {
-      size_t UGS = *reinterpret_cast<size_t *>(Args[0].MPtr);
-      assert(KernelCG->MNDRDesc.Dims == 1 &&
-             UGS < KernelCG->MNDRDesc.GlobalSize[0]);
-      UserGlobalSize = UGS;
+        !Args.empty()) {
+      auto &A0 = Args[0];
+      int Dims = KernelCG->MNDRDesc.Dims;
+      if (A0.MPtr && A0.MSize == (Dims * sizeof(size_t)) &&
+          A0.MType == kernel_param_kind_t::kind_std_layout) {
+        size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
+        // Range-rounding only pads dimension 0; the others must be unchanged.
+        assert(UGS[0] < KernelCG->MNDRDesc.GlobalSize[0]);
+        assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
+        assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
+        UserGlobalSize = UGS[0];
+      }
     }
 
     ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
diff --git a/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp b/sycl/test-e2e/KernelFusion/different_nd_ranges.cpp
@@ -1,5 +1,6 @@
 // RUN: %{build} -o %t.out
-// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \
+// RUN:   %{run} %t.out 2>&1 | FileCheck %s
 
 // Test complete fusion of kernels with different ND-ranges.
 
@@ -264,5 +265,10 @@ int main() {
         RangeDesc{{10, 1, 1}, {5, 1, 1}}});
 
   // Test global sizes that trigger the rounded range kernel insertion.
-  test({RangeDesc{3000}, RangeDesc{7727}, RangeDesc{4096}});
+  // Note that we lower the RR threshold when running this test.
+  test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}});
+
+  // Test multi-dimensional range-rounded kernels. Only the first dimension will
+  // be rounded up.
+  test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}});
 }
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
-// RUN: %{run} %t.out
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
 // accessors.
@@ -71,7 +71,8 @@ int main() {
   test<512>();
 
   // Test prime size large enough to trigger rounded-range kernel insertion.
-  test<7727>();
+  // Note that we lower the RR threshold when running this test.
+  test<523>();
 
   return 0;
 }
diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
-// RUN: %{run} %t.out
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
 // accessors for two-dimensional range.
@@ -8,9 +8,9 @@
 
 using namespace sycl;
 
-int main() {
-  constexpr size_t sizeX = 16;
-  constexpr size_t sizeY = 32;
+template <typename BaseName, size_t sizeX, size_t sizeY> class KernelName;
+
+template <size_t sizeX, size_t sizeY> static void test() {
   constexpr size_t dataSize = sizeX * sizeY;
   int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
 
@@ -42,7 +42,7 @@ int main() {
       auto accIn2 = bIn2.get_access(cgh);
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
-      cgh.parallel_for<class KernelOne>(
+      cgh.parallel_for<KernelName<class KernelOne, sizeX, sizeY>>(
           xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
     });
 
@@ -51,7 +51,7 @@ int main() {
           cgh, sycl::ext::codeplay::experimental::property::promote_private{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
+      cgh.parallel_for<KernelName<class KernelTwo, sizeX, sizeY>>(
           xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
     });
 
@@ -66,6 +66,15 @@ int main() {
     assert(out[i] == (20 * i * i) && "Computation error");
     assert(tmp[i] == -1 && "Not internalized");
   }
+}
+
+int main() {
+  // Test power-of-two size.
+  test<16, 32>();
+
+  // Test prime sizes large enough to trigger rounded-range kernel insertion.
+  // Note that we lower the RR threshold when running this test.
+  test<67, 79>();
 
   return 0;
 }