Skip to content

Commit 2b6f2cd

Browse files
authored
[SYCL][CUDA] Add missing barrier to collectives (#2990)
SYCL sub-group and group functions should act as synchronization points. Group collectives need a barrier at the end to ensure that back-to-back collectives do not lead to a race condition. Note that the barrier at the beginning of each collective occurs after each work-item writes its partial results to the scratch space. This is assumed safe because only the collective functions can access the space, and collective functions must be encountered in uniform control flow; any work-item encountering a collective function can assume it is safe to use the scratch space, because all work-items in the same work-group must have either executed no collective functions or the barrier at the end of a previous collective function. Signed-off-by: John Pennycook [email protected]
1 parent aeb4de7 commit 2b6f2cd

File tree

2 files changed

+71
-0
lines changed

2 files changed

+71
-0
lines changed

libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX)
260260
result = OP(sg_x, sg_prefix); \
261261
} \
262262
} \
263+
__spirv_ControlBarrier(Workgroup, 0, 0); \
263264
return result; \
264265
}
265266

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %RUN_ON_HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// Regression test for back-to-back group collectives: without a trailing
// barrier in each collective, consecutive calls can race on the shared
// scratch space and produce wrong reductions/scans.
#include <CL/sycl.hpp>
#include <iostream> // used directly via std::cout below
#include <numeric>
#include <vector>
using namespace cl::sycl;
using namespace cl::sycl::ONEAPI;

class back_to_back;

int main() {
  queue q;
  // Group collectives are not meaningful on the host device; skip there.
  if (q.get_device().is_host()) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Use max work-group size to maximize chance of race
  program prog(q.get_context());
  prog.build_with_kernel_type<back_to_back>();
  kernel k = prog.get_kernel<back_to_back>();
  device d = q.get_device();
  int N = k.get_info<info::kernel_device_specific::work_group_size>(d);

  // Input is 0..N-1 so expected reduction/scan values are closed-form.
  std::vector<int> Input(N), Sum(N), EScan(N), IScan(N);
  std::iota(Input.begin(), Input.end(), 0);
  std::fill(Sum.begin(), Sum.end(), 0);
  std::fill(EScan.begin(), EScan.end(), 0);
  std::fill(IScan.begin(), IScan.end(), 0);

  {
    buffer<int> InputBuf(Input.data(), N);
    buffer<int> SumBuf(Sum.data(), N);
    buffer<int> EScanBuf(EScan.data(), N);
    buffer<int> IScanBuf(IScan.data(), N);
    q.submit([&](handler &h) {
      auto Input = InputBuf.get_access<access::mode::read>(h);
      auto Sum = SumBuf.get_access<access::mode::write>(h);
      auto EScan = EScanBuf.get_access<access::mode::write>(h);
      auto IScan = IScanBuf.get_access<access::mode::write>(h);
      h.parallel_for<back_to_back>(nd_range<1>(N, N), [=](nd_item<1> it) {
        size_t i = it.get_global_id(0);
        auto g = it.get_group();
        // Loop to increase number of back-to-back calls
        for (int r = 0; r < 10; ++r) {
          Sum[i] = reduce(g, Input[i], plus<>());
          EScan[i] = exclusive_scan(g, Input[i], plus<>());
          IScan[i] = inclusive_scan(g, Input[i], plus<>());
        }
      });
    });
  } // buffers destruct here, copying results back to the host vectors

  // Verify against a serial prefix sum: EScan[i] excludes element i,
  // IScan[i] includes it, and every Sum[i] equals the full reduction.
  int sum = 0;
  bool passed = true;
  for (int i = 0; i < N; ++i) {
    passed &= (sum == EScan[i]);
    sum += i;
    passed &= (sum == IScan[i]);
  }
  for (int i = 0; i < N; ++i) {
    passed &= (sum == Sum[i]);
  }
  // Report the actual verification result; previously the test printed
  // "Test passed." and returned 0 unconditionally, so the `passed` flag was
  // never checked and a real race could go undetected by the lit harness.
  if (passed) {
    std::cout << "Test passed." << std::endl;
    return 0;
  }
  std::cout << "Test FAILED." << std::endl;
  return 1;
}

0 commit comments

Comments
 (0)