[SYCL] Specialize atomic fetch_add for floating point types (#2765)

Artem Gindinson · web-flow · commit 37a9a2a9dbb8 · 2021-01-21T19:54:16.000+03:00
The new EXT/SPV_EXT_shader_atomic_float_add SPIR-V extension allows us to further specialize atomic_ref::fetch_add() for floating-point types. In device mode, we'll now be creating an external call to a built-in-like __spirv_AtomicFAddEXT(). This is similar to what is done for other atomic binary instructions, e.g. the integer specialization of fetch_add() being mapped onto __spirv_AtomicIAdd(). Furthermore, atomic_ref::fetch_sub() is also re-implemented to use __spirv_AtomicFAddEXT(), the added operand being a negation of the original one. The new implementation can be exposed if a dedicated macro is defined: SYCL_USE_NATIVE_FP_ATOMICS. Otherwise, a fallback is used, where the atomic operation is emulated with a load followed by a compare_exchange_weak() retry loop. At the moment of committing this, only Intel GPUs support the "native" implementation, which relies on a SPIR-V extension. Tests for the feature have been finalized in intel/llvm-test-suite#104. Signed-off-by: Artem Gindinson <artem.gindinson@intel.com>
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -79,6 +79,10 @@ extern SYCL_EXTERNAL TempRetT __spirv_ImageSampleExplicitLod(SampledType,
   extern SYCL_EXTERNAL Type __spirv_AtomicISub(                                \
       AS Type *P, __spv::Scope::Flag S, __spv::MemorySemanticsMask::Flag O,    \
       Type V);
+#define __SPIRV_ATOMIC_FADD(AS, Type)                                          \
+  extern SYCL_EXTERNAL Type __spirv_AtomicFAddEXT(                             \
+      AS Type *P, __spv::Scope::Flag S, __spv::MemorySemanticsMask::Flag O,    \
+      Type V); // declares the FP atomic-add built-in for one (AS, Type) pair
 #define __SPIRV_ATOMIC_SMIN(AS, Type)                                          \
   extern SYCL_EXTERNAL Type __spirv_AtomicSMin(                                \
       AS Type *P, __spv::Scope::Flag S, __spv::MemorySemanticsMask::Flag O,    \
@@ -109,6 +113,7 @@ extern SYCL_EXTERNAL TempRetT __spirv_ImageSampleExplicitLod(SampledType,
       Type V);
 
 #define __SPIRV_ATOMIC_FLOAT(AS, Type)                                         \
+  __SPIRV_ATOMIC_FADD(AS, Type)                                                \
   __SPIRV_ATOMIC_LOAD(AS, Type)                                                \
   __SPIRV_ATOMIC_STORE(AS, Type)                                               \
   __SPIRV_ATOMIC_EXCHANGE(AS, Type)
diff --git a/sycl/include/CL/sycl/ONEAPI/atomic_ref.hpp b/sycl/include/CL/sycl/ONEAPI/atomic_ref.hpp
@@ -453,6 +453,11 @@ class atomic_ref_impl<
 
   T fetch_add(T operand, memory_order order = default_read_modify_write_order,
               memory_scope scope = default_scope) const noexcept {
+// TODO: Remove the "native atomics" macro check once implemented for all
+// backends
+#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_NATIVE_FP_ATOMICS)
+    return detail::spirv::AtomicFAdd(ptr, scope, order, operand);
+#else
     auto load_order = detail::getLoadOrder(order);
     T expected;
     T desired;
@@ -462,6 +467,7 @@ class atomic_ref_impl<
       desired = expected + operand;
     } while (!compare_exchange_weak(expected, desired, order, scope));
     return expected;
+#endif
   }
 
   T operator+=(T operand) const noexcept {
@@ -470,13 +476,19 @@ class atomic_ref_impl<
 
   T fetch_sub(T operand, memory_order order = default_read_modify_write_order,
               memory_scope scope = default_scope) const noexcept {
+// TODO: Remove the "native atomics" macro check once implemented for all
+// backends
+#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_NATIVE_FP_ATOMICS)
+    return detail::spirv::AtomicFAdd(ptr, scope, order, -operand);
+#else
     auto load_order = detail::getLoadOrder(order);
     T expected = load(load_order, scope);
     T desired;
     do {
       desired = expected - operand;
     } while (!compare_exchange_weak(expected, desired, order, scope));
     return expected;
+#endif
   }
 
   T operator-=(T operand) const noexcept {
diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -385,6 +385,16 @@ AtomicISub(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
   return __spirv_AtomicISub(Ptr, SPIRVScope, SPIRVOrder, Value);
 }
 
+template <typename T, access::address_space AddressSpace>
+inline typename detail::enable_if_t<std::is_floating_point<T>::value, T>
+AtomicFAdd(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
+           ONEAPI::memory_order Order, T Value) {
+  auto *Ptr = MPtr.get(); // raw pointer passed to the SPIR-V built-in
+  auto SPIRVOrder = getMemorySemanticsMask(Order); // order -> semantics mask
+  auto SPIRVScope = getScope(Scope); // scope -> SPIR-V scope
+  return __spirv_AtomicFAddEXT(Ptr, SPIRVScope, SPIRVOrder, Value);
+}
+
 template <typename T, access::address_space AddressSpace>
 inline typename detail::enable_if_t<std::is_integral<T>::value, T>
 AtomicAnd(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
diff --git a/sycl/test/atomic_ref/add.cpp b/sycl/test/atomic_ref/add.cpp
@@ -1,5 +1,10 @@
+// TODO: Once NVPTX accepts the __spirv_AtomicF*() IR, remove the XFAIL mark
+// XFAIL: cuda
+
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
+// RUN:  -fsycl-device-only -S %s -o - | FileCheck %s --check-prefix=CHECK-LLVM
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-device-only -S %s -o - \
-// RUN: | FileCheck %s --check-prefix=CHECK-LLVM
+// RUN: | FileCheck %s --check-prefix=CHECK-LLVM-EMU
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %RUN_ON_HOST %t.out
 
@@ -167,22 +172,24 @@ void add_test(queue q, size_t N) {
 // Floating-point types do not support pre- or post-increment
 template <> void add_test<float>(queue q, size_t N) {
   add_fetch_test<float>(q, N);
-  // CHECK-LLVM: declare dso_local spir_func i32
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
-  // CHECK-LLVM-SAME: (i32 addrspace(1)*, i32, i32)
-  // CHECK-LLVM: declare dso_local spir_func i32
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
-  // CHECK-LLVM-SAME: (i32 addrspace(1)*, i32, i32, i32, i32, i32)
+  // CHECK-LLVM: declare dso_local spir_func float
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicFAddEXT
+  // CHECK-LLVM-SAME: (float addrspace(1)*, i32, i32, float)
+  // CHECK-LLVM-EMU: declare {{.*}} i32 @{{.*}}__spirv_AtomicLoad
+  // CHECK-LLVM-EMU-SAME: (i32 addrspace(1)*, i32, i32)
+  // CHECK-LLVM-EMU: declare {{.*}} i32 @{{.*}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-EMU-SAME: (i32 addrspace(1)*, i32, i32, i32, i32, i32)
   add_plus_equal_test<float>(q, N);
 }
 template <> void add_test<double>(queue q, size_t N) {
   add_fetch_test<double>(q, N);
-  // CHECK-LLVM: declare dso_local spir_func i64
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
-  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32)
-  // CHECK-LLVM: declare dso_local spir_func i64
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
-  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
+  // CHECK-LLVM: declare dso_local spir_func double
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicFAddEXT
+  // CHECK-LLVM-SAME: (double addrspace(1)*, i32, i32, double)
+  // CHECK-LLVM-EMU: declare {{.*}} i64 @{{.*}}__spirv_AtomicLoad
+  // CHECK-LLVM-EMU-SAME: (i64 addrspace(1)*, i32, i32)
+  // CHECK-LLVM-EMU: declare {{.*}} i64 @{{.*}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-EMU-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
   add_plus_equal_test<double>(q, N);
 }
 
@@ -219,9 +226,15 @@ int main() {
   // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicIAdd
   // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i64)
   add_test<unsigned long long>(q, N);
-  // The remaining functions have been instantiated earlier
+  // Floating point-typed functions have been instantiated earlier
   add_test<float>(q, N);
   add_test<double>(q, N);
+  // CHECK-LLVM: declare dso_local spir_func i64
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
+  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32)
+  // CHECK-LLVM: declare dso_local spir_func i64
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
   add_test<char *, ptrdiff_t>(q, N);
 
   std::cout << "Test passed." << std::endl;
diff --git a/sycl/test/atomic_ref/sub.cpp b/sycl/test/atomic_ref/sub.cpp
@@ -1,3 +1,10 @@
+// TODO: Once NVPTX accepts the __spirv_AtomicF*() IR, remove the XFAIL mark
+// XFAIL: cuda
+
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -DSYCL_USE_NATIVE_FP_ATOMICS \
+// RUN:  -fsycl-device-only -S %s -o - | FileCheck %s --check-prefix=CHECK-LLVM
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-device-only -S %s -o - \
+// RUN: | FileCheck %s --check-prefix=CHECK-LLVM-EMU
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %RUN_ON_HOST %t.out
 
@@ -165,22 +172,24 @@ void sub_test(queue q, size_t N) {
 // Floating-point types do not support pre- or post-decrement
 template <> void sub_test<float>(queue q, size_t N) {
   sub_fetch_test<float>(q, N);
-  // CHECK-LLVM: declare dso_local spir_func i32
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
-  // CHECK-LLVM-SAME: (i32 addrspace(1)*, i32, i32)
-  // CHECK-LLVM: declare dso_local spir_func i32
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
-  // CHECK-LLVM-SAME: (i32 addrspace(1)*, i32, i32, i32, i32, i32)
+  // CHECK-LLVM: declare dso_local spir_func float
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicFAddEXT
+  // CHECK-LLVM-SAME: (float addrspace(1)*, i32, i32, float)
+  // CHECK-LLVM-EMU: declare {{.*}} i32 @{{.*}}__spirv_AtomicLoad
+  // CHECK-LLVM-EMU-SAME: (i32 addrspace(1)*, i32, i32)
+  // CHECK-LLVM-EMU: declare {{.*}} i32 @{{.*}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-EMU-SAME: (i32 addrspace(1)*, i32, i32, i32, i32, i32)
   sub_plus_equal_test<float>(q, N);
 }
 template <> void sub_test<double>(queue q, size_t N) {
   sub_fetch_test<double>(q, N);
-  // CHECK-LLVM: declare dso_local spir_func i64
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
-  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32)
-  // CHECK-LLVM: declare dso_local spir_func i64
-  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
-  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
+  // CHECK-LLVM: declare dso_local spir_func double
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicFAddEXT
+  // CHECK-LLVM-SAME: (double addrspace(1)*, i32, i32, double)
+  // CHECK-LLVM-EMU: declare {{.*}} i64 @{{.*}}__spirv_AtomicLoad
+  // CHECK-LLVM-EMU-SAME: (i64 addrspace(1)*, i32, i32)
+  // CHECK-LLVM-EMU: declare {{.*}} i64 @{{.*}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-EMU-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
   sub_plus_equal_test<double>(q, N);
 }
 
@@ -217,9 +226,15 @@ int main() {
   // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicISub
   // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i64)
   sub_test<unsigned long long>(q, N);
-  // The remaining functions have been instantiated earlier
+  // Floating point-typed functions have been instantiated earlier
   sub_test<float>(q, N);
   sub_test<double>(q, N);
+  // CHECK-LLVM: declare dso_local spir_func i64
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicLoad
+  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32)
+  // CHECK-LLVM: declare dso_local spir_func i64
+  // CHECK-LLVM-SAME: @_Z{{[0-9]+}}__spirv_AtomicCompareExchange
+  // CHECK-LLVM-SAME: (i64 addrspace(1)*, i32, i32, i32, i64, i64)
   sub_test<char *, ptrdiff_t>(q, N);
 
   std::cout << "Test passed." << std::endl;