intel · v-klochkov · Jan 31, 2023 · Jan 20, 2023 · Jan 20, 2023 · Jan 20, 2023
@@ -61,7 +61,11 @@ template <typename T, int N> bool test(queue &Q) {
   try {
     Q.submit([&](handler &CGH) {
        CGH.parallel_for(sycl::range<1>{1}, [=](id<1>) SYCL_ESIMD_KERNEL {
+#ifndef USE_SCALAR_OFFSET
          simd<Toffset, N> Offsets(0u, sizeof(T));
+#else
+             Toffset Offsets = 0;
+#endif
          scatter<T, N>(Dst, Offsets, gather<T, N>(Src, Offsets));
        });
      }).wait();
@@ -72,7 +76,11 @@ template <typename T, int N> bool test(queue &Q) {
 
   unsigned NumErrs = 0;
   for (int I = 0; I < N; ++I)
+#ifndef USE_SCALAR_OFFSET
     if (Dst[I] != Src[I])
+#else
+    if ((Dst[I] != Src[I] && I == 0) || (I != 0 && Dst[I] != 0))
+#endif
       if (++NumErrs <= 10)
         std::cout << "failed at " << I << ": " << Dst[I]
                   << " (Dst) != " << Src[I] << " (Src)\n";
@@ -90,48 +98,59 @@ int main(void) {
   bool Pass = true;
 
   Pass &= test<int8_t, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
   Pass &= test<int8_t, 2>(Q);
   Pass &= test<int8_t, 4>(Q);
   Pass &= test<int8_t, 8>(Q);
   Pass &= test<int8_t, 16>(Q);
   Pass &= test<int8_t, 32>(Q);
+#endif
 
   Pass &= test<int16_t, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
   Pass &= test<int16_t, 2>(Q);
   Pass &= test<int16_t, 4>(Q);
   Pass &= test<int16_t, 8>(Q);
   Pass &= test<int16_t, 16>(Q);
   Pass &= test<int16_t, 32>(Q);
+#endif
 
   Pass &= test<int32_t, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
   Pass &= test<int32_t, 2>(Q);
   Pass &= test<int32_t, 4>(Q);
   Pass &= test<int32_t, 8>(Q);
   Pass &= test<int32_t, 16>(Q);
   Pass &= test<int32_t, 32>(Q);
-
+#endif
   if (Dev.has(aspect::fp16)) {
     Pass &= test<half, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
     Pass &= test<half, 2>(Q);
     Pass &= test<half, 4>(Q);
     Pass &= test<half, 8>(Q);
     Pass &= test<half, 16>(Q);
     Pass &= test<half, 32>(Q);
+#endif
   }
 
   Pass &= test<bfloat16, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
   Pass &= test<bfloat16, 2>(Q);
   Pass &= test<bfloat16, 4>(Q);
   Pass &= test<bfloat16, 8>(Q);
   Pass &= test<bfloat16, 16>(Q);
   Pass &= test<bfloat16, 32>(Q);
+#endif
 #ifdef USE_TF32
   Pass &= test<tfloat32, 1>(Q);
+#ifndef USE_SCALAR_OFFSET
   Pass &= test<tfloat32, 2>(Q);
   Pass &= test<tfloat32, 4>(Q);
   Pass &= test<tfloat32, 8>(Q);
   Pass &= test<tfloat32, 16>(Q);
   Pass &= test<tfloat32, 32>(Q);
+#endif
 #endif
 
   std::cout << (Pass ? "Test Passed\n" : "Test FAILED\n");

@@ -0,0 +1,18 @@
+//==------ svm_gather_scatter_scalar_off.cpp  - DPC++ ESIMD on-device test -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu && !gpu-intel-pvc
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// Regression test for gather/scatter API.
+// scalar offset variant of the test - uses scalar offset.
+
+#define USE_SCALAR_OFFSET
+
+#include "svm_gather_scatter.cpp"
@@ -0,0 +1,27 @@
+//==-- dword_atomic_cmpxchg_scalar_off.cpp  - DPC++ ESIMD on-device test -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks LSC atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// TODO: esimd_emulator fails due to random timeouts (_XFAIL_: esimd_emulator)
+// UNSUPPORTED: esimd_emulator
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This macro enables only cmpxch tests. They may require more time to execute,
+// and have higher probablity to hit kernel execution time limit, so they are
+// separated.
+#define CMPXCHG_TEST
+
+// This macro enforces usage of dword atomics in the included test.
+#define USE_DWORD_ATOMICS
+// This macro enforces scalar offset variant of the test - uses scalar offsets.
+#define USE_SCALAR_OFFSET
+
+#include "lsc/atomic_smoke.cpp"
@@ -0,0 +1,21 @@
+//==--- dword_atomic_smoke_scalar_off.cpp  - DPC++ ESIMD on-device test ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks LSC atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// TODO: esimd_emulator fails due to random timeouts (_XFAIL_: esimd_emulator)
+// UNSUPPORTED: esimd_emulator
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// scalar offset variant of the test - uses scalar offsets.
+
+#define USE_DWORD_ATOMICS
+#define USE_SCALAR_OFFSET
+
+#include "lsc/atomic_smoke.cpp"
@@ -95,7 +95,11 @@ bool test(uint32_t pmask = 0xffffffff) {
               lsc_block_store<T, VS, lsc_data_size::default_size>(
                   out + elem_off, vals);
             } else {
+#ifndef USE_SCALAR_OFFSET
               simd<Toffset, VL> offset(byte_off, VS * sizeof(T));
+#else
+              Toffset offset = byte_off;
+#endif
               simd_mask<VL> pred;
               for (int i = 0; i < VL; i++)
                 pred.template select<1, 1>(i) = (pmask >> i) & 1;
@@ -142,8 +146,17 @@ bool test(uint32_t pmask = 0xffffffff) {
     for (int i = 0; i < Size; i++) {
       Tuint in_val = sycl::bit_cast<Tuint>(in[i]);
       Tuint out_val = sycl::bit_cast<Tuint>(out[i]);
+#ifndef USE_SCALAR_OFFSET
       Tuint e = (pmask >> ((i / VS) % VL)) & 1 ? in_val & vmask
                                                : sycl::bit_cast<Tuint>(old_val);
+#else
+      // Calculate the mask to identify the areas that were actually updated
+      constexpr uint16_t mask =
+          1U << ((sycl::bit_cast<uint32_t>((float)VL) >> 23) - 126);
+      Tuint e = ((i / VS) % VL == 0) && (pmask >> ((i / VS) % VL)) & (mask - 1)
+                    ? in_val & vmask
+                    : sycl::bit_cast<Tuint>(old_val);
+#endif
       if (out_val != e) {
         passed = false;
         std::cout << "out[" << i << "] = 0x" << std::hex << out_val

@@ -0,0 +1,23 @@
+//==---- atomic_cmpxchg_scalar_off.cpp  - DPC++ ESIMD on-device test ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks LSC compare-and-exchange atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// TODO: esimd_emulator fails due to random timeouts (_XFAIL_: esimd_emulator)
+// UNSUPPORTED: esimd_emulator
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This macro enables only cmpxch tests. They may require more time to execute,
+// and have higher probablity to hit kernel execution time limit, so they are
+// separated.
+#define CMPXCHG_TEST
+// This macro enforces scalar offset variant of the test - uses scalar offsets.
+#define USE_SCALAR_OFFSET
+
+#include "atomic_smoke.cpp"
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: gpu-intel-pvc
 // TODO: esimd_emulator fails due to random timeouts (_XFAIL_: esimd_emulator)
+// TODO: esimd_emulator doesn't support xchg operation
 // UNSUPPORTED: esimd_emulator
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -216,8 +217,12 @@ bool test(queue q, const Config &cfg) {
       cgh.parallel_for<TestID<T, N, ImplF>>(
           rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
             int i = ii;
+#ifndef USE_SCALAR_OFFSET
             simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
                                      cfg.stride * sizeof(T));
+#else
+            Toffset offsets = 0;
+#endif
             simd_mask<N> m = 1;
             m[cfg.masked_lane] = 0;
         // barrier to achieve better contention:
@@ -318,8 +323,14 @@ template <class T, int N> struct ImplInc {
   static T init(int i, const Config &cfg) { return (T)0; }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg)
                  ? (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups)
+#else
+    T gold =
+        i == 0
+            ? (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups * (N - 1))
+#endif
                  : init(i, cfg);
     return gold;
   }
@@ -331,11 +342,20 @@ template <class T, int N> struct ImplDec {
   static constexpr int base = 5;
 
   static T init(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups + base);
+#else
+    return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups * (N - 1) +
+               base);
+#endif
   }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg) ? (T)base : init(i, cfg);
+#else
+    T gold = i == 0 ? (T)base : init(i, cfg);
+#endif
     return gold;
   }
 };
@@ -364,7 +384,11 @@ template <class T, int N> struct ImplStore {
   static T init(int i, const Config &cfg) { return 0; }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg) ? base : init(i, cfg);
+#else
+    T gold = i == 0 ? base : init(i, cfg);
+#endif
     return gold;
   }
 
@@ -378,9 +402,15 @@ template <class T, int N, class C, C Op> struct ImplAdd {
   static T init(int i, const Config &cfg) { return 0; }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg) ? (T)(cfg.repeat * cfg.threads_per_group *
                                          cfg.n_groups * (T)(1 + FPDELTA))
                                    : init(i, cfg);
+#else
+    T gold = i == 0 ? (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups *
+                          (N - 1) * (T)(1 + FPDELTA))
+                    : init(i, cfg);
+#endif
     return gold;
   }
 
@@ -393,13 +423,23 @@ template <class T, int N, class C, C Op> struct ImplSub {
   static constexpr T base = (T)(5 + FPDELTA);
 
   static T init(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups *
                    (T)(1 + FPDELTA) +
                base);
+#else
+    return (T)(cfg.repeat * cfg.threads_per_group * cfg.n_groups * (N - 1) *
+                   (T)(1 + FPDELTA) +
+               base);
+#endif
   }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg) ? base : init(i, cfg);
+#else
+    T gold = i == 0 ? base : init(i, cfg);
+#endif
     return gold;
   }
 
@@ -416,7 +456,11 @@ template <class T, int N, class C, C Op> struct ImplMin {
   }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg) ? (T)MIN : init(i, cfg);
+#else
+    T gold = i == 0 ? (T)MIN : init(i, cfg);
+#endif
     return gold;
   }
 
@@ -431,7 +475,11 @@ template <class T, int N, class C, C Op> struct ImplMax {
   static T init(int i, const Config &cfg) { return (T)FPDELTA; }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg)
+#else
+    T gold = i == 0
+#endif
                  ? (T)(cfg.threads_per_group * cfg.n_groups - 1 + FPDELTA)
                  : init(i, cfg);
     return gold;
@@ -482,7 +530,11 @@ template <class T, int N, class C, C Op> struct ImplCmpxchgBase {
   static T init(int i, const Config &cfg) { return base - 1; }
 
   static T gold(int i, const Config &cfg) {
+#ifndef USE_SCALAR_OFFSET
     T gold = is_updated(i, N, cfg)
+#else
+    T gold = i == 0
+#endif
                  ? (T)(cfg.threads_per_group * cfg.n_groups - 1 + base)
                  : init(i, cfg);
     return gold;
@@ -606,12 +658,14 @@ int main(void) {
 
   // Check load/store operations
   passed &= test_int_types<8, ImplLoad>(q, cfg);
+#ifndef USE_SCALAR_OFFSET
   if (q.get_backend() != sycl::backend::ext_intel_esimd_emulator)
     passed &= test_int_types<8, ImplStore>(q, cfg);
 #ifndef USE_DWORD_ATOMICS
   if (q.get_backend() != sycl::backend::ext_intel_esimd_emulator)
     passed &= test<float, 8, ImplStore>(q, cfg);
 #endif // USE_DWORD_ATOMICS
+#endif
-#endif
+#endif // !USE_SCALAR_OFFSET
-#endif
+#endif // !USE_SCALAR_OFFSET
   // TODO: check double other vector lengths in LSC mode.
 
   std::cout << (passed ? "Passed\n" : "FAILED\n");

@@ -0,0 +1,21 @@
+//==---------------- atomic_smoke_scalar_off.cpp  - DPC++ ESIMD on-device test
+//-----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This test checks LSC atomic operations.
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// TODO: esimd_emulator fails due to random timeouts (_XFAIL_: esimd_emulator)
+// UNSUPPORTED: esimd_emulator
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// scalar offset variant of the test - uses scalar offsets.
+
+#define USE_SCALAR_OFFSET
+
+#include "atomic_smoke.cpp"