intel · aelovikov-intel · Apr 2, 2024 · Mar 28, 2024
@@ -1326,8 +1326,6 @@ class __SYCL_EXPORT handler {
                   "Kernel argument cannot have a sycl::nd_item type in "
                   "sycl::parallel_for with sycl::range");
 
-#if defined(SYCL2020_CONFORMANT_APIS) ||                                       \
-    defined(__INTEL_PREVIEW_BREAKING_CHANGES)
     static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
                       std::is_convertible_v<item<Dims, false>, LambdaArgType>,
                   "sycl::parallel_for(sycl::range) kernel must have the "
@@ -1340,7 +1338,6 @@ class __SYCL_EXPORT handler {
          std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
         "SYCL kernel lambda/functor has an unexpected signature, it should be "
         "invocable with sycl::item and optionally sycl::kernel_handler");
-#endif
 
     // TODO: Properties may change the kernel function, so in order to avoid
     //       conflicts they should be included in the name.
@@ -1432,19 +1429,11 @@ class __SYCL_EXPORT handler {
     verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
-#if defined(SYCL2020_CONFORMANT_APIS) ||                                       \
-    defined(__INTEL_PREVIEW_BREAKING_CHANGES)
     static_assert(
         std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
         "Kernel argument of a sycl::parallel_for with sycl::nd_range "
         "must be either sycl::nd_item or be convertible from sycl::nd_item");
     using TransformedArgType = sycl::nd_item<Dims>;
-#else
-    // If user type is convertible from sycl::item/sycl::nd_item, use
-    // sycl::item/sycl::nd_item to transport item information
-    using TransformedArgType =
-        typename TransformUserItemType<Dims, LambdaArgType>::type;
-#endif
 
     (void)ExecutionRange;
     (void)Props;

@@ -32,11 +32,12 @@ void enqueueKernel_1_fromFile1(queue *Q) {
 
     CGH.parallel_for<class Kernel_1>(
         sycl::nd_range(Buf.get_range(), sycl::range<1>(4)),
-        [=](sycl::id<1> wiID) {
+        [=](sycl::nd_item<1> ndi) {
+          auto gid = ndi.get_global_id(0);
           int X = 0;
-          if (wiID == 5)
+          if (gid == 5)
             X = checkFunction();
-          Acc[wiID] = X;
+          Acc[gid] = X;
         });
   });
 }

@@ -59,8 +59,8 @@ int main(int Argc, const char *Argv[]) {
       sycl::local_accessor<int, 1> LocalAcc(LocalMemSize, CGH);
 
       CGH.parallel_for<class kernel_using_local_memory>(
-          NDRange, [=](sycl::item<1> itemID) {
-            size_t i = itemID.get_id(0);
+          NDRange, [=](sycl::nd_item<1> ndi) {
+            size_t i = ndi.get_global_id(0);
             int *Ptr = LocalAcc.get_pointer();
             Ptr[i] = i + 5;
             Harray[i] = Ptr[i] + 5;

@@ -107,16 +107,17 @@ struct DeviceFunc {
   DeviceFunc(const T *In0, const T *In1, const T *In2, T *Out)
       : In0(In0), In1(In1), In2(In2), Out(Out) {}
 
-  void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
-    unsigned int Offset = I * N;
+  void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
+    auto gid = ndi.get_global_id(0);
+    unsigned int Offset = gid * N;
     esimd::simd<T, N> V0;
     esimd::simd<T, N> V1;
     esimd::simd<T, N> V2;
     V0.copy_from(In0 + Offset);
     V1.copy_from(In1 + Offset);
     V2.copy_from(In2 + Offset);
 
-    if (I.get(0) % 2 == 0) {
+    if (gid % 2 == 0) {
       for (int J = 0; J < N; J++) {
         Kernel<T, N, Op, AllSca> DevF{};
         T Val0 = V0[J];

@@ -238,12 +238,13 @@ struct UnaryDeviceFunc {
 
   UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {}
 
-  void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
-    unsigned int Offset = I * N * sizeof(T);
+  void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
+    auto gid = ndi.get_global_id(0);
+    unsigned int Offset = gid * N * sizeof(T);
     esimd::simd<T, N> Vx;
     Vx.copy_from(In, Offset);
 
-    if (I.get(0) % 2 == 0) {
+    if (gid % 2 == 0) {
       for (int J = 0; J < N; J++) {
         Kernel<T, N, Op, AllSca> DevF{};
         T Val = Vx[J];
@@ -269,13 +270,14 @@ struct BinaryDeviceFunc {
   BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out)
       : In1(In1), In2(In2), Out(Out) {}
 
-  void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
-    unsigned int Offset = I * N * sizeof(T);
+  void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
+    auto gid = ndi.get_global_id(0);
+    unsigned int Offset = gid * N * sizeof(T);
     esimd::simd<T, N> V1(In1, Offset);
     esimd::simd<T, N> V2(In2, Offset);
     esimd::simd<T, N> V;
 
-    if (I.get(0) % 2 == 0) {
+    if (gid % 2 == 0) {
       int Ind = 0;
       {
         Kernel<T, N, Op, AllSca> DevF{};

@@ -50,7 +50,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
     q.submit([&](handler &cgh) {
        auto acc = buf.template get_access<access::mode::read_write>(cgh);
        auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
-       cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
+       cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
          using namespace sycl::ext::intel::esimd;
          simd<T, VL> valsIn;
          valsIn.copy_from(acc, 0);

@@ -89,7 +89,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
        auto OutAcc = OutBuf.template get_access<access::mode::read_write>(cgh);
        auto LocalAcc = local_accessor<T, 1>(VL * NUM_RGBA_CHANNELS, cgh);
 
-       cgh.parallel_for(Range, [=](id<1> i) SYCL_ESIMD_KERNEL {
+       cgh.parallel_for(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
          using namespace sycl::ext::intel::esimd;
          constexpr int numChannels = get_num_channels_enabled(CH_MASK);
 
@@ -135,7 +135,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
                -1;
          }
 
-         uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
+         uint32_t global_offset = ndi.get_global_id(0) * VL * NUM_RGBA_CHANNELS;
          valsOut.copy_to(OutAcc, global_offset);
        });
      }).wait();

@@ -212,8 +212,8 @@ bool test(queue q, const Config &cfg) {
   try {
     auto e = q.submit([&](handler &cgh) {
       cgh.parallel_for<TestID<T, N, ImplF>>(
-          rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
-            int i = ii;
+          rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            int i = ndi.get_global_id(0);
 #ifndef USE_SCALAR_OFFSET
             simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
                                      cfg.stride * sizeof(T));
@@ -332,8 +332,8 @@ bool test(queue q, const Config &cfg) {
     auto e = q.submit([&](handler &cgh) {
       auto accessor = buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<TestID<T, N, ImplF>>(
-          rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
-            int i = ii;
+          rng, [=](nd_item<1> gid) SYCL_ESIMD_KERNEL {
+            int i = gid.get_global_id(0);
 #ifndef USE_SCALAR_OFFSET
             simd<Toffset, N> offsets(start * sizeof(T), stride * sizeof(T));
 #else

@@ -52,7 +52,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
     q.submit([&](handler &cgh) {
        auto acc = buf.template get_access<access::mode::read_write>(cgh);
        auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
-       cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
+       cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
          using namespace sycl::ext::intel::esimd;
          simd<T, VL> valsIn;
          valsIn.copy_from(acc, 0);

@@ -153,8 +153,8 @@ bool test_usm(queue q, const Config &cfg) {
 
   try {
     auto e = q.submit([&](handler &cgh) {
-      cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
-        int i = ii;
+      cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+        int i = ndi.get_global_id(0);
         simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
                                  cfg.stride * sizeof(T));
         simd_mask<N> m = 1;
@@ -287,8 +287,8 @@ bool test_acc(queue q, const Config &cfg) {
       auto e = q.submit([&](handler &cgh) {
         auto arr_acc =
             arr_buf.template get_access<access::mode::read_write>(cgh);
-        cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
-          int i = ii;
+        cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+          int i = ndi.get_global_id(0);
           simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
                                    cfg.stride * sizeof(T));
           simd_mask<N> m = 1;

@@ -25,12 +25,16 @@ template <int Dimensions> size_t getSize(nd_range<Dimensions> r) {
   return r.get_global_range().size();
 }
 
+template <int N> auto global_linear_id(sycl::nd_item<N> ndi) {
+  return ndi.get_global_linear_id();
+}
+template <int N> auto global_linear_id(sycl::item<N> i) {
+  return i.get_linear_id();
+}
+
 template <typename Kernel1Name, typename Kernel2Name, typename Range1,
           typename Range2>
 void performFusion(queue &q, Range1 R1, Range2 R2) {
-  using IndexTy1 = item<Range1::dimensions>;
-  using IndexTy2 = item<Range2::dimensions>;
-
   int in[dataSize], tmp[dataSize], out[dataSize];
 
   for (size_t i = 0; i < dataSize; ++i) {
@@ -51,17 +55,17 @@ void performFusion(queue &q, Range1 R1, Range2 R2) {
     q.submit([&](handler &cgh) {
       auto accIn = bIn.get_access(cgh);
       auto accTmp = bTmp.get_access(cgh);
-      cgh.parallel_for<Kernel1Name>(R1, [=](IndexTy1 i) {
-        size_t j = i.get_linear_id();
+      cgh.parallel_for<Kernel1Name>(R1, [=](auto i) {
+        size_t j = global_linear_id(i);
         accTmp[j] = accIn[j] + 5;
       });
     });
 
     q.submit([&](handler &cgh) {
       auto accTmp = bTmp.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<Kernel2Name>(R2, [=](IndexTy2 i) {
-        size_t j = i.get_linear_id();
+      cgh.parallel_for<Kernel2Name>(R2, [=](auto i) {
+        size_t j = global_linear_id(i);
         accOut[j] = accTmp[j] * 2;
       });
     });

@@ -49,8 +49,10 @@ void performFusion(queue &q, Internalization intKernel1,
 
       if (localSizeKernel1 > 0) {
         cgh.parallel_for<class Kernel1>(
-            nd_range<1>{{dataSize}, {localSizeKernel1}},
-            [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
+            nd_range<1>{{dataSize}, {localSizeKernel1}}, [=](nd_item<1> ndi) {
+              auto i = ndi.get_global_id(0);
+              accTmp[i] = accIn[i] + 5;
+            });
       } else {
         cgh.parallel_for<class KernelOne>(
             dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
@@ -70,8 +72,10 @@ void performFusion(queue &q, Internalization intKernel1,
       auto accOut = bOut.get_access(cgh);
       if (localSizeKernel2 > 0) {
         cgh.parallel_for<class Kernel2>(
-            nd_range<1>{{dataSize}, {localSizeKernel2}},
-            [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
+            nd_range<1>{{dataSize}, {localSizeKernel2}}, [=](nd_item<1> ndi) {
+              auto i = ndi.get_global_id(0);
+              accOut[i] = accTmp[i] * 2;
+            });
       } else {
         cgh.parallel_for<class KernelTwo>(
             dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });

@@ -16,7 +16,10 @@ struct AddKernel {
   accessor<int, 1> accIn2;
   accessor<int, 1> accOut;
 
-  void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
+  void operator()(nd_item<1> ndi) const {
+    auto i = ndi.get_global_id(0);
+    accOut[i] = accIn1[i] + accIn2[i];
+  }
 };
 
 int main() {
@@ -71,17 +74,21 @@ int main() {
       auto accTmp1 = bTmp1.get_access(cgh);
       auto accIn3 = bIn3.get_access(cgh);
       auto accTmp2 = bTmp2.get_access(cgh);
-      cgh.parallel_for<class KernelOne>(
-          nd_range<1>{{dataSize}, {16}},
-          [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
+      cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
+                                        [=](nd_item<1> ndi) {
+                                          auto i = ndi.get_global_id(0);
+                                          accTmp2[i] = accTmp1[i] * accIn3[i];
+                                        });
     });
 
     q.submit([&](handler &cgh) {
       auto accTmp1 = bTmp1.get_access(cgh);
       auto accTmp3 = bTmp3.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
-          nd_range<1>{{dataSize}, {16}},
-          [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
+      cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
+                                        [=](nd_item<1> ndi) {
+                                          auto i = ndi.get_global_id(0);
+                                          accTmp3[i] = accTmp1[i] * 5;
+                                        });
     });
 
     q.submit([&](handler &cgh) {

@@ -56,9 +56,11 @@ int main() {
           cgh, sycl::ext::codeplay::experimental::property::promote_local{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
-          nd_range<1>{{dataSize}, {16}},
-          [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+      cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
+                                        [=](nd_item<1> ndi) {
+                                          auto i = ndi.get_global_id(0);
+                                          accOut[i] = accTmp[i] * accIn3[i];
+                                        });
     });
 
     fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

@@ -84,7 +84,8 @@ int main() {
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_local{});
       cgh.parallel_for<class KernelOne>(
-          nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
+          nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
+            auto id = ndi.get_global_id();
             const auto &accIn1Wrapp = accIn1[id];
             const auto &accIn2Wrapp = accIn2[id];
             auto &accTmpWrapp = accTmp[id];
@@ -105,7 +106,8 @@ int main() {
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
       cgh.parallel_for<class KernelTwo>(
-          nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
+          nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
+            auto id = ndi.get_global_id();
             const auto &tmpWrapp = accTmp[id];
             const auto &accIn3Wrapp = accIn3[id];
             auto &accOutWrapp = accOut[id];

@@ -41,19 +41,23 @@ int main() {
       auto accIn2 = bIn2.get_access(cgh);
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_local{});
-      cgh.parallel_for<class KernelOne>(
-          nd_range<1>{{dataSize}, {16}},
-          [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+      cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
+                                        [=](nd_item<1> ndi) {
+                                          auto i = ndi.get_global_id(0);
+                                          accTmp[i] = accIn1[i] + accIn2[i];
+                                        });
     });
 
     q.submit([&](handler &cgh) {
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_local{});
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
-      cgh.parallel_for<class KernelTwo>(
-          nd_range<1>{{dataSize}, {16}},
-          [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+      cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
+                                        [=](nd_item<1> ndi) {
+                                          auto i = ndi.get_global_id(0);
+                                          accOut[i] = accTmp[i] * accIn3[i];
+                                        });
     });
 
     fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

@@ -42,8 +42,8 @@ int main() {
       auto accTmp = bTmp.get_access(
           cgh, sycl::ext::codeplay::experimental::property::promote_local{});
       cgh.parallel_for<class KernelOne>(
-          nd_range<1>{{128}, {8}}, [=](item<1> i) {
-            auto baseOffset = i.get_linear_id() * 4;
+          nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
+            auto baseOffset = ndi.get_global_linear_id() * 4;
             for (size_t j = 0; j < 4; ++j) {
               accTmp[baseOffset + j] =
                   accIn1[baseOffset + j] + accIn2[baseOffset + j];
@@ -57,8 +57,8 @@ int main() {
       auto accIn3 = bIn3.get_access(cgh);
       auto accOut = bOut.get_access(cgh);
       cgh.parallel_for<class KernelTwo>(
-          nd_range<1>{{128}, {8}}, [=](item<1> i) {
-            auto baseOffset = i.get_linear_id() * 4;
+          nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
+            auto baseOffset = ndi.get_global_linear_id() * 4;
             for (size_t j = 0; j < 4; ++j) {
               accOut[baseOffset + j] =
                   accTmp[baseOffset + j] * accIn3[baseOffset + j];