Skip to content

[SYCL] Adjust mapping to OpenCL to simplify tuning. #720

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sycl/include/CL/__spirv/spirv_vars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ extern "C" const __attribute__((ocl_constant)) uint32_t __spirv_BuiltInSubgroupL
}; \
\
template <class DstT> struct InitSizesST##POSTFIX<2, DstT> { \
static DstT initSize() { return {get##POSTFIX<0>(), get##POSTFIX<1>()}; } \
static DstT initSize() { return {get##POSTFIX<1>(), get##POSTFIX<0>()}; } \
}; \
\
template <class DstT> struct InitSizesST##POSTFIX<3, DstT> { \
static DstT initSize() { \
return {get##POSTFIX<0>(), get##POSTFIX<1>(), get##POSTFIX<2>()}; \
return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()}; \
} \
}; \
\
Expand Down
61 changes: 18 additions & 43 deletions sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,19 +185,12 @@ class HostKernel : public HostKernelBase {
template <class ArgT = KernelArgType>
typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value>::type
runOnHost(const NDRDescT &NDRDesc) {
size_t XYZ[3] = {0};
sycl::id<Dims> ID;
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I];
MKernel(ID);
}
}
}
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

detail::NDLoop<Dims>::iterate(
Range, [&](const sycl::id<Dims> &ID) { MKernel(ID); });
}

template <class ArgT = KernelArgType>
Expand All @@ -210,20 +203,11 @@ class HostKernel : public HostKernelBase {
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I];

sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
MKernel(Item);
}
}
}
detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
MKernel(Item);
});
}

template <class ArgT = KernelArgType>
Expand All @@ -236,22 +220,13 @@ class HostKernel : public HostKernelBase {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
}
size_t XYZ[3] = {0};
sycl::id<Dims> ID;
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I] + Offset[I];

sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);
MKernel(Item);
}
}
}

detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> &ID) {
sycl::id<Dims> OffsetID = ID + Offset;
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, OffsetID, Offset);
MKernel(Item);
});
}

template <class ArgT = KernelArgType>
Expand Down
12 changes: 7 additions & 5 deletions sycl/include/CL/sycl/detail/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ struct NDLoopIterateImpl {
const LoopBoundTy<NDIMS> &Stride,
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
LoopIndexTy<NDIMS> &Index) {

for (Index[DIM] = LowerBound[DIM]; Index[DIM] < UpperBound[DIM];
Index[DIM] += Stride[DIM]) {
constexpr size_t AdjIdx = NDIMS - 1 - DIM;
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
Index[AdjIdx] += Stride[AdjIdx]) {

NDLoopIterateImpl<NDIMS, DIM - 1, LoopBoundTy, FuncTy, LoopIndexTy>{
LowerBound, Stride, UpperBound, f, Index};
Expand All @@ -177,8 +177,9 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
LoopIndexTy<NDIMS> &Index) {

for (Index[0] = LowerBound[0]; Index[0] < UpperBound[0];
Index[0] += Stride[0]) {
constexpr size_t AdjIdx = NDIMS - 1;
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
Index[AdjIdx] += Stride[AdjIdx]) {

f(Index);
}
Expand All @@ -190,6 +191,7 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
/// over a multi-dimensional space - it allows to avoid generating unnecessary
/// outer loops like 'for (int z=0; z<1; z++)' in case of 1D and 2D iteration
/// spaces or writing specializations of the algorithms for 1D, 2D and 3D cases.
/// Loops are unrolled in reverse order, i.e. the loop over dimension 0 is the inner-most one.
template <int NDIMS> struct NDLoop {
/// Generates ND loop nest with {0,..0} .. \c UpperBound bounds with unit
/// stride. Applies \c f at each iteration, passing current index of
Expand Down
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/id.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
private:
using base = detail::array<dimensions>;
static_assert(dimensions >= 1 && dimensions <= 3,
"id can only be 1, 2, or 3 dimentional.");
"id can only be 1, 2, or 3 dimensional.");
template <int N, int val, typename T>
using ParamTy = detail::enable_if_t<(N == val), T>;

Expand Down
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace sycl {
template <int dimensions> class id;
template <int dimensions = 1> class range : public detail::array<dimensions> {
static_assert(dimensions >= 1 && dimensions <= 3,
"range can only be 1, 2, or 3 dimentional.");
"range can only be 1, 2, or 3 dimensional.");
using base = detail::array<dimensions>;

public:
Expand Down
24 changes: 23 additions & 1 deletion sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,23 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
}

// We have the following mapping between dimensions with SPIRV builtins:
// 1D: id[0] -> x
// 2D: id[0] -> y, id[1] -> x
// 3D: id[0] -> z, id[1] -> y, id[2] -> x
// So in order to ensure correctness we update all the kernel
// parameters accordingly.
// Initially we keep the order of NDRDescT as provided by the user; this
// simplifies overall handling, and we do the reverse only when
// the kernel is enqueued.
static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
if (NDR.Dims > 1) {
std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
}
}

// The function initialize accessors and calls lambda.
// The function is used as argument to piEnqueueNativeKernel which requires
// that the passed function takes one void* argument.
Expand Down Expand Up @@ -803,10 +820,15 @@ cl_int ExecCGCommand::enqueueImp() {
getSyclObjImpl(Context)->getUSMDispatch();
USMDispatch->setKernelIndirectAccess(Kernel, MQueue->getHandleRef());

// Remember this information before the range dimensions are reversed
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);

ReverseRangeDimensionsForKernel(NDRDesc);

PI_CALL(RT::piEnqueueKernelLaunch(
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
&NDRDesc.GlobalSize[0],
NDRDesc.LocalSize[0] ? &NDRDesc.LocalSize[0] : nullptr,
HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0], &Event));

Expand Down
50 changes: 50 additions & 0 deletions sycl/test/linear_id/linear-host-dev.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s
//==--------------- linear-host-dev.cpp - SYCL linear id test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Check that the linear id is monotonically increasing on the host device.
// Only there can we reliably check that. Since the kernel has a restriction
// regarding usage of global variables, use a stream to log the linear id
// and ensure that the logged values are monotonically increasing.
//
// Note: This test heavily relies on the current implementation of the
// host device (single-threaded, ordered execution). So if the implementation
// is somehow changed so that it's no longer possible to run this test
// reliably, it can be removed.

namespace s = cl::sycl;

int main(int argc, char *argv[]) {
  s::queue q;

  // 3x2 iteration space: linear ids are expected to be 0..5, in order.
  const size_t outer = 3;
  const size_t inner = 2;
  const s::range<2> rng = {outer, inner};

  q.submit([&](s::handler &h) {
    // Stream with a 1024-byte buffer and an 80-byte max statement size.
    s::stream out(1024, 80, h);

    h.parallel_for<class linear_id>(s::range<2>(rng), [=](s::item<2> item) {
      // CHECK: 0
      // CHECK-NEXT: 1
      // CHECK-NEXT: 2
      // CHECK-NEXT: 3
      // CHECK-NEXT: 4
      // CHECK-NEXT: 5
      out << item.get_linear_id() << "\n";
    });
  });

  return 0;
}
54 changes: 54 additions & 0 deletions sycl/test/linear_id/linear-sub_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//==--------------- linear-sub_group.cpp - SYCL linear id test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "../sub_group/helper.hpp"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

using namespace cl::sycl;

int main(int argc, char *argv[]) {
  queue Queue;
  if (!core_sg_supported(Queue.get_device())) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Launch a single Rows x Cols work-group where every work-item records its
  // linearized sub-group position.
  const uint32_t Rows = 2;
  const uint32_t Cols = 8;
  std::vector<int> Result(Rows * Cols, 0);
  {
    buffer<int, 2> ResultBuf(Result.data(), range<2>(Rows, Cols));
    Queue.submit([&](handler &CGH) {
      auto Acc = ResultBuf.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class linear_id>(
          nd_range<2>(range<2>(Rows, Cols), range<2>(Rows, Cols)),
          [=](nd_item<2> It) {
            const id<2> GlobalId = It.get_global_id();
            intel::sub_group SG = It.get_sub_group();
            Acc[GlobalId] = SG.get_group_id()[0] * SG.get_local_range()[0] +
                            SG.get_local_id()[0];
          });
    });
  }

  // If the dimension mapping is correct, every element equals its own
  // linear index.
  for (size_t Idx = 0; Idx < Rows * Cols; ++Idx) {
    assert(Result[Idx] == static_cast<int>(Idx));
  }
  std::cout << "Test passed.\n";
  return 0;
}
74 changes: 74 additions & 0 deletions sycl/test/linear_id/opencl-interop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//==---------------- opencl-interop.cpp - SYCL linear id test --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>

using namespace cl::sycl;

int main(int argc, char *argv[]) {
  queue q;
  if (q.is_host()) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Compute the expected answer on the host: one sum per row of dimA,
  // each over dimB consecutive input values.
  const uint32_t dimA = 2;
  const uint32_t dimB = 8;
  std::vector<int> input(dimA * dimB), output(dimA), expected(dimA);
  std::iota(input.begin(), input.end(), 0);
  for (uint32_t j = 0; j < dimA; ++j) {
    int sum = 0;
    for (uint32_t i = 0; i < dimB; ++i) {
      sum += input[j * dimB + i];
    }
    expected[j] = sum;
  }

  // Compute the sums on the device using one work-group per element of dimA.
  // The OpenCL kernel reads the row from get_global_id(1) and the column from
  // get_global_id(0), which matches the SYCL-to-OpenCL dimension reversal
  // performed at enqueue time.
  program prog(q.get_context(), {q.get_device()});
  prog.build_with_source("__kernel void sum(__global const int* input, "
                         "__global int* output, const int dimA, const int dimB)"
                         "{"
                         "  int j = get_global_id(1);"
                         "  int i = get_global_id(0);"
                         "  int sum = work_group_reduce_add(input[j*dimB+i]);"
                         "  if (get_local_id(0) == 0)"
                         "  {"
                         "    output[j] = sum;"
                         "  }"
                         "}",
                         "-cl-std=CL2.0");
  kernel sum = prog.get_kernel("sum");
  {
    buffer<int, 2> input_buf(input.data(), range<2>(dimA, dimB));
    // `output` holds only dimA elements, so the buffer over it must be a
    // 1-D buffer of dimA elements. A range<2>(dimA, dimB) buffer would copy
    // dimA*dimB ints back into the dimA-element host vector on destruction,
    // writing out of bounds. The kernel only sees a __global int* either way.
    buffer<int, 1> output_buf(output.data(), range<1>(dimA));
    q.submit([&](handler &cgh) {
      auto input = input_buf.get_access<access::mode::read>(cgh);
      auto output = output_buf.get_access<access::mode::discard_write>(cgh);
      cgh.set_args(input, output, dimA, dimB);
      cgh.parallel_for(nd_range<2>(range<2>(dimA, dimB), range<2>(1, dimB)),
                       sum);
    });
  }

  // Compare with the expected result.
  for (uint32_t j = 0; j < dimA; ++j) {
    assert(output[j] == expected[j]);
  }
  std::cout << "Test passed.\n";
  return 0;
}