Skip to content

Commit 7bc8447

Browse files
authored
[SYCL][CUDA][libclc] Add software atomics implementations for lower sm versions (#5998)
Adds software implementations of various atomics for sm versions below those where they are supported natively. All atomics, except for the ones using system scope, are now supported regardless of the sm version. `SYCL_USE_NATIVE_FP_ATOMICS` is now also defined for CUDA by default. Closes #5936. Tested by: intel/llvm-test-suite#985
1 parent 0f0c5d1 commit 7bc8447

File tree

9 files changed

+220
-31
lines changed

9 files changed

+220
-31
lines changed

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,8 +1289,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
12891289

12901290
const llvm::Triple &DeviceTriple = TI.getTriple();
12911291
const llvm::Triple::SubArchType DeviceSubArch = DeviceTriple.getSubArch();
1292-
if (DeviceTriple.isSPIR() &&
1293-
DeviceSubArch != llvm::Triple::SPIRSubArch_fpga)
1292+
if (DeviceTriple.isNVPTX() ||
1293+
(DeviceTriple.isSPIR() &&
1294+
DeviceSubArch != llvm::Triple::SPIRSubArch_fpga))
12941295
Builder.defineMacro("SYCL_USE_NATIVE_FP_ATOMICS");
12951296
// Enable generation of USM address spaces for FPGA.
12961297
if (DeviceSubArch == llvm::Triple::SPIRSubArch_fpga) {

clang/test/Preprocessor/sycl-macro-target-specific.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
// RUN: %clang_cc1 %s -fsycl-is-device -triple spir64_fpga-unknown-unknown -E -dM \
2121
// RUN: | FileCheck --check-prefix=CHECK-SYCL-FP-ATOMICS-NEG %s
2222
// RUN: %clang_cc1 %s -fsycl-is-device -triple nvptx64-nvidia-nvcl -E -dM \
23-
// RUN: | FileCheck --check-prefix=CHECK-SYCL-FP-ATOMICS-NEG %s
23+
// RUN: | FileCheck --check-prefix=CHECK-SYCL-FP-ATOMICS %s
2424
// CHECK-SYCL-FP-ATOMICS: #define SYCL_USE_NATIVE_FP_ATOMICS
2525
// CHECK-SYCL-FP-ATOMICS-NEG-NOT: #define SYCL_USE_NATIVE_FP_ATOMICS
2626

libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,98 @@ __CLC_NVVM_ATOMIC(ulong, m, long, l, add, _Z18__spirv_AtomicIAdd)
1717

1818
__CLC_NVVM_ATOMIC(float, f, float, f, add, _Z21__spirv_AtomicFAddEXT)
1919
#ifdef cl_khr_int64_base_atomics
20-
__CLC_NVVM_ATOMIC(double, d, double, d, add, _Z21__spirv_AtomicFAddEXT)
20+
21+
#define __CLC_NVVM_ATOMIC_ADD_DOUBLE_IMPL(ADDR_SPACE, ADDR_SPACE_MANGLED, \
22+
ADDR_SPACE_NV, SUBSTITUTION1, \
23+
SUBSTITUTION2) \
24+
long \
25+
_Z18__spirv_AtomicLoadP##ADDR_SPACE_MANGLED##KlN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagE( \
26+
volatile ADDR_SPACE const long *, enum Scope, \
27+
enum MemorySemanticsMask); \
28+
long \
29+
_Z29__spirv_AtomicCompareExchange##P##ADDR_SPACE_MANGLED##lN5__spv5Scope4FlagENS##SUBSTITUTION1##_19MemorySemanticsMask4FlagES##SUBSTITUTION2##_ll( \
30+
volatile ADDR_SPACE long *, enum Scope, enum MemorySemanticsMask, \
31+
enum MemorySemanticsMask, long, long); \
32+
__attribute__((always_inline)) _CLC_DECL double \
33+
_Z21__spirv_AtomicFAddEXT##P##ADDR_SPACE_MANGLED##d##N5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagE##d( \
34+
volatile ADDR_SPACE double *pointer, enum Scope scope, \
35+
enum MemorySemanticsMask semantics, double value) { \
36+
/* Semantics mask may include memory order, storage class and other info \
37+
Memory order is stored in the lowest 5 bits */ \
38+
unsigned int order = semantics & 0x1F; \
39+
if (__clc_nvvm_reflect_arch() >= 600) { \
40+
switch (order) { \
41+
case None: \
42+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
43+
ADDR_SPACE_NV, ) \
44+
break; \
45+
case Acquire: \
46+
if (__clc_nvvm_reflect_arch() >= 700) { \
47+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
48+
ADDR_SPACE_NV, _acquire) \
49+
} else { \
50+
__CLC_NVVM_ATOMIC_IMPL_ACQUIRE_FENCE(double, double, d, add, \
51+
ADDR_SPACE, ADDR_SPACE_NV) \
52+
} \
53+
break; \
54+
case Release: \
55+
if (__clc_nvvm_reflect_arch() >= 700) { \
56+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
57+
ADDR_SPACE_NV, _release) \
58+
} else { \
59+
__spirv_MemoryBarrier(scope, Release); \
60+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
61+
ADDR_SPACE_NV, ) \
62+
} \
63+
break; \
64+
case AcquireRelease: \
65+
if (__clc_nvvm_reflect_arch() >= 700) { \
66+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
67+
ADDR_SPACE_NV, _acq_rel) \
68+
} else { \
69+
__spirv_MemoryBarrier(scope, Release); \
70+
__CLC_NVVM_ATOMIC_IMPL_ACQUIRE_FENCE(double, double, d, add, \
71+
ADDR_SPACE, ADDR_SPACE_NV) \
72+
} \
73+
break; \
74+
} \
75+
__builtin_trap(); \
76+
__builtin_unreachable(); \
77+
} else { \
78+
enum MemorySemanticsMask load_order; \
79+
switch (semantics) { \
80+
case SequentiallyConsistent: \
81+
load_order = SequentiallyConsistent; \
82+
break; \
83+
case Acquire: \
84+
case AcquireRelease: \
85+
load_order = Acquire; \
86+
break; \
87+
default: \
88+
load_order = None; \
89+
} \
90+
volatile ADDR_SPACE long *pointer_int = \
91+
(volatile ADDR_SPACE long *)pointer; \
92+
long old_int; \
93+
long new_val_int; \
94+
do { \
95+
old_int = \
96+
_Z18__spirv_AtomicLoadP##ADDR_SPACE_MANGLED##KlN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagE( \
97+
pointer_int, scope, load_order); \
98+
double new_val = *(double *)&old_int + *(double *)&value; \
99+
new_val_int = *(long *)&new_val; \
100+
} while ( \
101+
_Z29__spirv_AtomicCompareExchange##P##ADDR_SPACE_MANGLED##lN5__spv5Scope4FlagENS##SUBSTITUTION1##_19MemorySemanticsMask4FlagES##SUBSTITUTION2##_ll( \
102+
pointer_int, scope, semantics, semantics, new_val_int, \
103+
old_int) != old_int); \
104+
return *(double *)&old_int; \
105+
} \
106+
}
107+
108+
__CLC_NVVM_ATOMIC_ADD_DOUBLE_IMPL(, , _gen_, 0, 4)
109+
__CLC_NVVM_ATOMIC_ADD_DOUBLE_IMPL(__global, U3AS1, _global_, 1, 5)
110+
__CLC_NVVM_ATOMIC_ADD_DOUBLE_IMPL(__local, U3AS3, _shared_, 1, 5)
111+
21112
#endif
22113

23114
#undef __CLC_NVVM_ATOMIC_TYPES

0 commit comments

Comments
 (0)