Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 70c275c

Browse files
author
Artem Gindinson
committed
Address the atomics' macro rework: emulation runs by default
Signed-off-by: Artem Gindinson <[email protected]>
1 parent e2ea274 commit 70c275c

File tree

5 files changed

+400
-16
lines changed

5 files changed

+400
-16
lines changed

SYCL/AtomicRef/add.cpp renamed to SYCL/AtomicRef/add-emulated.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s \
2-
// RUN: -o %t.out %gpu_atomics_config
1+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
32
// RUN: %HOST_RUN_PLACEHOLDER %t.out
43
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5-
// TODO: Drop the separate "emulation" compilation once "native" OpenCL CPU
6-
// support is provided.
7-
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s \
8-
// RUN: -o %t.out.emulated -D__SYCL_EMULATE_FLOAT_ATOMICS__=1
9-
// RUN: %CPU_RUN_PLACEHOLDER %t.out.emulated
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
105

116
#include <CL/sycl.hpp>
127
#include <algorithm>

SYCL/AtomicRef/add-native.cpp

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
2+
// RUN: -fsycl-targets=%sycl_triple %s -o %t.out
3+
// RUN: %HOST_RUN_PLACEHOLDER %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
6+
// TODO: Remove XFAIL/UNSUPPORTED once corresponding backends support
7+
// "native" SPIR-V
8+
// XFAIL: cpu
9+
// UNSUPPORTED: cuda
10+
11+
#include <CL/sycl.hpp>
12+
#include <algorithm>
13+
#include <cassert>
14+
#include <iostream>
15+
#include <numeric>
16+
#include <vector>
17+
using namespace sycl;
18+
using namespace sycl::ONEAPI;
19+
20+
template <typename T, typename Difference = T>
21+
void add_fetch_test(queue q, size_t N) {
22+
T sum = 0;
23+
std::vector<T> output(N);
24+
std::fill(output.begin(), output.end(), T(0));
25+
{
26+
buffer<T> sum_buf(&sum, 1);
27+
buffer<T> output_buf(output.data(), output.size());
28+
29+
q.submit([&](handler &cgh) {
30+
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
31+
auto out =
32+
output_buf.template get_access<access::mode::discard_write>(cgh);
33+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
34+
int gid = it.get_id(0);
35+
auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
36+
ONEAPI::memory_scope::device,
37+
access::address_space::global_space>(sum[0]);
38+
out[gid] = atm.fetch_add(Difference(1));
39+
});
40+
});
41+
}
42+
43+
// All work-items increment by 1, so final value should be equal to N
44+
assert(sum == T(N));
45+
46+
// Fetch returns original value: will be in [0, N-1]
47+
auto min_e = std::min_element(output.begin(), output.end());
48+
auto max_e = std::max_element(output.begin(), output.end());
49+
assert(*min_e == T(0) && *max_e == T(N - 1));
50+
51+
// Intermediate values should be unique
52+
std::sort(output.begin(), output.end());
53+
assert(std::unique(output.begin(), output.end()) == output.end());
54+
}
55+
56+
template <typename T, typename Difference = T>
57+
void add_plus_equal_test(queue q, size_t N) {
58+
T sum = 0;
59+
std::vector<T> output(N);
60+
std::fill(output.begin(), output.end(), T(0));
61+
{
62+
buffer<T> sum_buf(&sum, 1);
63+
buffer<T> output_buf(output.data(), output.size());
64+
65+
q.submit([&](handler &cgh) {
66+
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
67+
auto out =
68+
output_buf.template get_access<access::mode::discard_write>(cgh);
69+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
70+
int gid = it.get_id(0);
71+
auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
72+
ONEAPI::memory_scope::device,
73+
access::address_space::global_space>(sum[0]);
74+
out[gid] = atm += Difference(1);
75+
});
76+
});
77+
}
78+
79+
// All work-items increment by 1, so final value should be equal to N
80+
assert(sum == T(N));
81+
82+
// += returns updated value: will be in [1, N]
83+
auto min_e = std::min_element(output.begin(), output.end());
84+
auto max_e = std::max_element(output.begin(), output.end());
85+
assert(*min_e == T(1) && *max_e == T(N));
86+
87+
// Intermediate values should be unique
88+
std::sort(output.begin(), output.end());
89+
assert(std::unique(output.begin(), output.end()) == output.end());
90+
}
91+
92+
template <typename T, typename Difference = T>
93+
void add_pre_inc_test(queue q, size_t N) {
94+
T sum = 0;
95+
std::vector<T> output(N);
96+
std::fill(output.begin(), output.end(), T(0));
97+
{
98+
buffer<T> sum_buf(&sum, 1);
99+
buffer<T> output_buf(output.data(), output.size());
100+
101+
q.submit([&](handler &cgh) {
102+
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
103+
auto out =
104+
output_buf.template get_access<access::mode::discard_write>(cgh);
105+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
106+
int gid = it.get_id(0);
107+
auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
108+
ONEAPI::memory_scope::device,
109+
access::address_space::global_space>(sum[0]);
110+
out[gid] = ++atm;
111+
});
112+
});
113+
}
114+
115+
// All work-items increment by 1, so final value should be equal to N
116+
assert(sum == T(N));
117+
118+
// Pre-increment returns updated value: will be in [1, N]
119+
auto min_e = std::min_element(output.begin(), output.end());
120+
auto max_e = std::max_element(output.begin(), output.end());
121+
assert(*min_e == T(1) && *max_e == T(N));
122+
123+
// Intermediate values should be unique
124+
std::sort(output.begin(), output.end());
125+
assert(std::unique(output.begin(), output.end()) == output.end());
126+
}
127+
128+
template <typename T, typename Difference = T>
129+
void add_post_inc_test(queue q, size_t N) {
130+
T sum = 0;
131+
std::vector<T> output(N);
132+
std::fill(output.begin(), output.end(), T(0));
133+
{
134+
buffer<T> sum_buf(&sum, 1);
135+
buffer<T> output_buf(output.data(), output.size());
136+
137+
q.submit([&](handler &cgh) {
138+
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
139+
auto out =
140+
output_buf.template get_access<access::mode::discard_write>(cgh);
141+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
142+
int gid = it.get_id(0);
143+
auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
144+
ONEAPI::memory_scope::device,
145+
access::address_space::global_space>(sum[0]);
146+
out[gid] = atm++;
147+
});
148+
});
149+
}
150+
151+
// All work-items increment by 1, so final value should be equal to N
152+
assert(sum == T(N));
153+
154+
// Post-increment returns original value: will be in [0, N-1]
155+
auto min_e = std::min_element(output.begin(), output.end());
156+
auto max_e = std::max_element(output.begin(), output.end());
157+
assert(*min_e == T(0) && *max_e == T(N - 1));
158+
159+
// Intermediate values should be unique
160+
std::sort(output.begin(), output.end());
161+
assert(std::unique(output.begin(), output.end()) == output.end());
162+
}
163+
164+
template <typename T, typename Difference = T>
165+
void add_test(queue q, size_t N) {
166+
add_fetch_test<T, Difference>(q, N);
167+
add_plus_equal_test<T, Difference>(q, N);
168+
add_pre_inc_test<T, Difference>(q, N);
169+
add_post_inc_test<T, Difference>(q, N);
170+
}
171+
172+
// Floating-point types do not support pre- or post-increment
173+
template <> void add_test<float>(queue q, size_t N) {
174+
add_fetch_test<float>(q, N);
175+
add_plus_equal_test<float>(q, N);
176+
}
177+
// Same restriction for double: no pre-/post-increment on floating-point
// atomics, so only fetch_add and += are exercised.
template <> void add_test<double>(queue q, size_t N) {
  add_fetch_test<double>(q, N);
  add_plus_equal_test<double>(q, N);
}
181+
182+
// Test driver: exercises atomic addition for every supported integral,
// floating-point, and pointer type on the default-selected device.
int main() {
  queue q;
  // NOTE(review): the original queried info::device::version into a local
  // that was never read (dead code, likely a leftover from version-gating);
  // the unused query has been removed.

  constexpr int N = 32;
  add_test<int>(q, N);
  add_test<unsigned int>(q, N);
  add_test<long>(q, N);
  add_test<unsigned long>(q, N);
  add_test<long long>(q, N);
  add_test<unsigned long long>(q, N);
  add_test<float>(q, N);
  add_test<double>(q, N);
  // Pointer atomics advance by ptrdiff_t-sized steps.
  add_test<char *, ptrdiff_t>(q, N);

  std::cout << "Test passed." << std::endl;
}

SYCL/AtomicRef/sub.cpp renamed to SYCL/AtomicRef/sub-emulated.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s \
2-
// RUN: -o %t.out %gpu_atomics_config
1+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
32
// RUN: %HOST_RUN_PLACEHOLDER %t.out
43
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5-
// TODO: Drop the separate "emulation" compilation once "native" OpenCL CPU
6-
// support is provided.
7-
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s \
8-
// RUN: -o %t.out.emulated -D__SYCL_EMULATE_FLOAT_ATOMICS__=1
9-
// RUN: %CPU_RUN_PLACEHOLDER %t.out.emulated
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
105

116
#include <CL/sycl.hpp>
127
#include <algorithm>

0 commit comments

Comments
 (0)