[SYCL][CUDA] Adds tests for atomic memory ordering (#363)

steffenlarsen · Steffen Larsen · web-flow · commit 82a7eec6d8e3 · 2021-08-10T11:08:27.000+03:00
* [SYCL][CUDA] Adds tests for atomic memory ordering

Signed-off-by: Steffen Larsen &lt;steffen.larsen@codeplay.com&gt;

* Fix formatting

Signed-off-by: Steffen Larsen &lt;steffen.larsen@codeplay.com&gt;

Co-authored-by: Steffen Larsen &lt;steffen.larsen@codeplay.com&gt;
diff --git a/SYCL/AtomicRef/accessor.cpp b/SYCL/AtomicRef/accessor.cpp
@@ -26,21 +26,21 @@ template <typename T> void accessor_test(queue q, size_t N) {
       static_assert(
           std::is_same<decltype(atomic_accessor(sum_buf, cgh, relaxed_order,
                                                 device_scope)),
-                       atomic_accessor<T, 1, ONEAPI::memory_order::relaxed,
-                                       ONEAPI::memory_scope::device>>::value,
+                       atomic_accessor<T, 1, memory_order::relaxed,
+                                       memory_scope::device>>::value,
           "atomic_accessor type incorrectly deduced");
 #endif
-      auto sum = atomic_accessor<T, 1, ONEAPI::memory_order::relaxed,
-                                 ONEAPI::memory_scope::device>(sum_buf, cgh);
+      auto sum =
+          atomic_accessor<T, 1, memory_order::relaxed, memory_scope::device>(
+              sum_buf, cgh);
       auto out =
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
         static_assert(
             std::is_same<
                 decltype(sum[0]),
-                atomic_ref<T, ONEAPI::memory_order::relaxed,
-                           ONEAPI::memory_scope::device,
+                atomic_ref<T, memory_order::relaxed, memory_scope::device,
                            access::address_space::global_space>>::value,
             "atomic_accessor returns incorrect atomic_ref");
         out[gid] = sum[0].fetch_add(T(1));
@@ -70,19 +70,18 @@ void local_accessor_test(queue q, size_t N, size_t L = 8) {
     buffer<T> output_buf(output.data(), output.size());
     q.submit([&](handler &cgh) {
       auto sum =
-          atomic_accessor<T, 1, ONEAPI::memory_order::relaxed,
-                          ONEAPI::memory_scope::device, access::target::local>(
-              1, cgh);
+          atomic_accessor<T, 1, memory_order::relaxed, memory_scope::device,
+                          access::target::local>(1, cgh);
       auto out = output_buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for(nd_range<1>(N, L), [=](nd_item<1> it) {
         int grp = it.get_group(0);
         sum[0].store(0);
         it.barrier();
         static_assert(
-            std::is_same<decltype(sum[0]),
-                         atomic_ref<T, ONEAPI::memory_order::relaxed,
-                                    ONEAPI::memory_scope::device,
-                                    access::address_space::local_space>>::value,
+            std::is_same<
+                decltype(sum[0]),
+                atomic_ref<T, memory_order::relaxed, memory_scope::device,
+                           access::address_space::local_space>>::value,
             "local atomic_accessor returns incorrect atomic_ref");
         T result = sum[0].fetch_add(T(1));
         if (result == it.get_local_range(0) - 1) {
diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp
@@ -28,8 +28,7 @@ void add_fetch_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(sum[0]);
         out[gid] = atm.fetch_add(Difference(1));
       });
@@ -64,8 +63,7 @@ void add_plus_equal_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(sum[0]);
         out[gid] = atm += Difference(1);
       });
@@ -100,8 +98,7 @@ void add_pre_inc_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(sum[0]);
         out[gid] = ++atm;
       });
@@ -136,8 +133,7 @@ void add_post_inc_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(sum[0]);
         out[gid] = atm++;
       });
diff --git a/SYCL/AtomicRef/atomic_memory_order.cpp b/SYCL/AtomicRef/atomic_memory_order.cpp
@@ -0,0 +1,149 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// L0, OpenCL, and ROCm backends don't currently support
+// info::device::atomic_memory_order_capabilities and aspect::atomic64
+// XFAIL: level_zero || opencl || rocm
+
+// NOTE: Tests load and store for supported memory orderings.
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+using namespace sycl::ONEAPI;
+
+template <typename T, memory_order MO> class memory_order_kernel;
+
+template <typename T> void acq_rel_test(queue q, size_t N) {
+  T a = 0;
+  {
+    buffer<T> a_buf(&a, 1);
+
+    q.submit([&](handler &cgh) {
+      auto a_acc = a_buf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<memory_order_kernel<T, memory_order::acq_rel>>(
+          range<1>(N), [=](item<1> it) {
+            int gid = it.get_id(0);
+            auto aar =
+                atomic_ref<T, memory_order::acq_rel, memory_scope::device,
+                           access::address_space::global_space>(a_acc[0]);
+            auto ld = aar.load();
+            ld += 1;
+            aar.store(ld);
+          });
+    });
+  }
+
+  // All work-items increment by 1, so final value should be equal to N
+  assert(a == T(N));
+}
+
+template <typename T> void seq_cst_test(queue q, size_t N) {
+  T a = 0;
+  T b = 0;
+  {
+    buffer<T> a_buf(&a, 1);
+    buffer<T> b_buf(&b, 1);
+
+    q.submit([&](handler &cgh) {
+      auto a_acc = a_buf.template get_access<access::mode::read_write>(cgh);
+      auto b_acc = b_buf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<memory_order_kernel<T, memory_order::seq_cst>>(
+          range<1>(N), [=](item<1> it) {
+            int gid = it.get_id(0);
+            auto aar =
+                atomic_ref<T, memory_order::seq_cst, memory_scope::device,
+                           access::address_space::global_space>(a_acc[0]);
+            auto bar =
+                atomic_ref<T, memory_order::seq_cst, memory_scope::device,
+                           access::address_space::global_space>(b_acc[0]);
+            auto ald = aar.load();
+            auto bld = bar.load();
+            ald += 1;
+            bld += ald;
+            bar.store(bld);
+            aar.store(ald);
+          });
+    });
+  }
+
+  // All work-items increment a by 1, so final value should be equal to N
+  assert(a == T(N));
+  // b is the sum of [1..N]
+  size_t rsum = 0;
+  for (size_t i = 1; i <= N; ++i)
+    rsum += i;
+  assert(b == T(rsum));
+}
+
+bool is_supported(std::vector<memory_order> capabilities,
+                  memory_order mem_order) {
+  return std::find(capabilities.begin(), capabilities.end(), mem_order) !=
+         capabilities.end();
+}
+
+int main() {
+  queue q;
+
+  std::vector<memory_order> supported_memory_orders =
+      q.get_device().get_info<info::device::atomic_memory_order_capabilities>();
+  bool atomic64_support = q.get_device().has(aspect::atomic64);
+
+  constexpr int N = 32;
+
+  // Relaxed memory order must be supported. This ordering is used in other
+  // tests.
+  assert(is_supported(supported_memory_orders, memory_order::relaxed));
+
+  if (is_supported(supported_memory_orders, memory_order::acq_rel)) {
+    // Acquire-release memory order must also support both acquire and release
+    // orderings.
+    assert(is_supported(supported_memory_orders, memory_order::acquire) &&
+           is_supported(supported_memory_orders, memory_order::release));
+    acq_rel_test<int>(q, N);
+    acq_rel_test<unsigned int>(q, N);
+    acq_rel_test<float>(q, N);
+    if (sizeof(long) == 4) {
+      // long is 32-bit
+      acq_rel_test<long>(q, N);
+      acq_rel_test<unsigned long>(q, N);
+    }
+    if (atomic64_support) {
+      if (sizeof(long) == 8) {
+        // long is 64-bit
+        acq_rel_test<long>(q, N);
+        acq_rel_test<unsigned long>(q, N);
+      }
+      acq_rel_test<long long>(q, N);
+      acq_rel_test<unsigned long long>(q, N);
+      acq_rel_test<double>(q, N);
+    }
+  }
+
+  if (is_supported(supported_memory_orders, memory_order::seq_cst)) {
+    seq_cst_test<int>(q, N);
+    seq_cst_test<unsigned int>(q, N);
+    seq_cst_test<float>(q, N);
+    if (sizeof(long) == 4) {
+      // long is 32-bit
+      seq_cst_test<long>(q, N);
+      seq_cst_test<unsigned long>(q, N);
+    }
+    if (atomic64_support) {
+      if (sizeof(long) == 8) {
+        // long is 64-bit
+        seq_cst_test<long>(q, N);
+        seq_cst_test<unsigned long>(q, N);
+      }
+      seq_cst_test<long long>(q, N);
+      seq_cst_test<unsigned long long>(q, N);
+      seq_cst_test<double>(q, N);
+    }
+  }
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp
@@ -31,9 +31,9 @@ template <typename T> void compare_exchange_test(queue q, size_t N) {
       cgh.parallel_for<compare_exchange_kernel<T>>(
           range<1>(N), [=](item<1> it) {
             size_t gid = it.get_id(0);
-            auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                                  ONEAPI::memory_scope::device,
-                                  access::address_space::global_space>(exc[0]);
+            auto atm =
+                atomic_ref<T, memory_order::relaxed, memory_scope::device,
+                           access::address_space::global_space>(exc[0]);
             T result = T(N); // Avoid copying pointer
             bool success = atm.compare_exchange_strong(result, (T)gid);
             if (success) {
diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp
@@ -29,8 +29,7 @@ template <typename T> void exchange_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<exchange_kernel<T>>(range<1>(N), [=](item<1> it) {
         size_t gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(exc[0]);
         out[gid] = atm.exchange(T(gid));
       });
diff --git a/SYCL/AtomicRef/load.cpp b/SYCL/AtomicRef/load.cpp
@@ -28,8 +28,7 @@ template <typename T> void load_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for<load_kernel<T>>(range<1>(N), [=](item<1> it) {
         size_t gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(ld[0]);
         out[gid] = atm.load();
       });
diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp
@@ -27,8 +27,7 @@ template <typename T> void max_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
 
         // +1 accounts for lowest() returning 0 for unsigned types
diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp
@@ -27,8 +27,7 @@ template <typename T> void min_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_min(T(gid));
       });
diff --git a/SYCL/AtomicRef/store.cpp b/SYCL/AtomicRef/store.cpp
@@ -22,8 +22,7 @@ template <typename T> void store_test(queue q, size_t N) {
       auto st = store_buf.template get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<store_kernel<T>>(range<1>(N), [=](item<1> it) {
         size_t gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(st[0]);
         atm.store(T(gid));
       });
diff --git a/SYCL/AtomicRef/sub.cpp b/SYCL/AtomicRef/sub.cpp
@@ -28,8 +28,7 @@ void sub_fetch_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
         out[gid] = atm.fetch_sub(Difference(1));
       });
@@ -64,8 +63,7 @@ void sub_plus_equal_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
         out[gid] = atm -= Difference(1);
       });
@@ -100,8 +98,7 @@ void sub_pre_dec_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
         out[gid] = --atm;
       });
@@ -136,8 +133,7 @@ void sub_post_dec_test(queue q, size_t N) {
           output_buf.template get_access<access::mode::discard_write>(cgh);
       cgh.parallel_for(range<1>(N), [=](item<1> it) {
         int gid = it.get_id(0);
-        auto atm = atomic_ref<T, ONEAPI::memory_order::relaxed,
-                              ONEAPI::memory_scope::device,
+        auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
                               access::address_space::global_space>(val[0]);
         out[gid] = atm--;
       });