intel · steffenlarsen · Dec 8, 2023 · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023
@@ -87,6 +87,16 @@ bool jit_compiler::isHeterogeneousList(ArrayRef<NDRange> NDRanges) {
   return any_of(NDRanges, [&ND](const auto &Other) { return ND != Other; });
 }
 
+static bool wouldYieldUniformWorkGroupSize(const Indices &LocalSize,
+                                           llvm::ArrayRef<NDRange> NDRanges) {
+  const auto GlobalSize = getMaximalGlobalSize(NDRanges);
+  return llvm::all_of(llvm::zip_equal(GlobalSize, LocalSize),
+                      [](const std::tuple<std::size_t, std::size_t> &P) {
+                        const auto &[GlobalSize, LocalSize] = P;
+                        return GlobalSize % LocalSize == 0;
+                      });
+}
+
 bool jit_compiler::isValidCombination(llvm::ArrayRef<NDRange> NDRanges) {
   if (NDRanges.empty()) {
     return false;
@@ -95,9 +105,14 @@ bool jit_compiler::isValidCombination(llvm::ArrayRef<NDRange> NDRanges) {
   const auto &ND = FirstSpecifiedLocalSize == NDRanges.end()
                        ? NDRanges.front()
                        : *FirstSpecifiedLocalSize;
-  return llvm::all_of(NDRanges, [&ND](const auto &Other) {
-    return compatibleRanges(ND, Other);
-  });
+  return llvm::all_of(NDRanges,
+                      [&ND](const auto &Other) {
+                        return compatibleRanges(ND, Other);
+                      }) &&
+         // Either no local size is specified or the maximal global size is
+         // compatible with the specified local size.
+         (FirstSpecifiedLocalSize == NDRanges.end() ||
+          wouldYieldUniformWorkGroupSize(ND.getLocalSize(), NDRanges));
 }
 
 bool jit_compiler::requireIDRemapping(const NDRange &LHS, const NDRange &RHS) {

@@ -87,8 +87,9 @@ FusionResult KernelFusion::fuseKernels(
 
   if (!isValidCombination(NDRanges)) {
     return FusionResult{
-        "Cannot fuse kernels with different offsets or local sizes or "
-        "different global sizes in dimensions [2, N) and non-zero offsets"};
+        "Cannot fuse kernels with different offsets or local sizes, or "
+        "different global sizes in dimensions [2, N) and non-zero offsets, "
+        "or those whose fusion would yield non-uniform work-groups sizes"};
   }
 
   bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);

@@ -14,9 +14,16 @@ constexpr size_t dataSize = 512;
 
 enum class Internalization { None, Local, Private };
 
-template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
-void performFusion(queue &q, range<Kernel1Dim> k1Global,
-                   range<Kernel1Dim> k1Local) {
+template <typename Range> size_t getSize(Range r);
+
+template <> size_t getSize(range<1> r) { return r.size(); }
+template <> size_t getSize(nd_range<1> r) {
+  return r.get_global_range().size();
+}
+
+template <typename Kernel1Name, typename Kernel2Name, typename Range1,
+          typename Range2>
+void performFusion(queue &q, Range1 R1, Range2 R2) {
   int in[dataSize], tmp[dataSize], out[dataSize];
 
   for (size_t i = 0; i < dataSize; ++i) {
@@ -37,19 +44,15 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,
     q.submit([&](handler &cgh) {
       auto accIn = bIn.get_access(cgh);
       auto accTmp = bTmp.get_access(cgh);
-      cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
-                                    [=](item<Kernel1Dim> i) {
-                                      auto LID = i.get_linear_id();
-                                      accTmp[LID] = accIn[LID] + 5;
-                                    });
+      cgh.parallel_for<Kernel1Name>(
+          R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; });
     });
 
     q.submit([&](handler &cgh) {
       auto accTmp = bTmp.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}}, [=](id<1> i) {
-        accOut[i] = accTmp[i] * 2;
-      });
+      cgh.parallel_for<Kernel2Name>(
+          R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
     });
 
     fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
@@ -60,7 +63,8 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,
 
   // Check the results
   size_t numErrors = 0;
-  for (size_t i = 0; i < k1Global.size(); ++i) {
+  size_t size = getSize(R1);
+  for (size_t i = 0; i < size; ++i) {
     if (out[i] != ((i + 5) * 2)) {
       ++numErrors;
     }
@@ -89,8 +93,9 @@ int main() {
 
   // Scenario: Fusing two kernels with different local size should lead to
   // fusion being aborted.
-  performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
-                                                  range<1>{16});
+  performFusion<class Kernel1_3, class Kernel2_3>(
+      q, nd_range<1>{range<1>{dataSize}, range<1>{16}},
+      nd_range<1>{range<1>{dataSize}, range<1>{8}});
   // CHECK: ERROR: JIT compilation for kernel fusion failed with message:
   // CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
   // CHECK: COMPUTATION OK
@@ -101,5 +106,13 @@ int main() {
   // CHECK-NOT: Cannot fuse kernels with different offsets or local sizes
   // CHECK: WARNING: Fusion list is empty
 
+  // Scenario: Fusing two kernels that would lead to non-uniform work-group
+  // sizes should lead to fusion being aborted.
+  performFusion<class Kernel1_4, class Kernel2_4>(
+      q, nd_range<1>{range<1>{9}, range<1>{3}}, range<1>{dataSize});
+  // CHECK: ERROR: JIT compilation for kernel fusion failed with message:
+  // CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
+  // CHECK: COMPUTATION OK
+
   return 0;
 }