Skip to content

Commit a432eca

Browse files
author
Hugh Delaney
committed
WIP
1 parent e4e5a01 commit a432eca

File tree

2 files changed

+27
-17
lines changed

2 files changed

+27
-17
lines changed

libclc/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
339339
elseif( ${ARCH} STREQUAL amdgcn )
  # AMDGCN needs generic address space for atomics.
  # NOTE(review): the previous version issued two consecutive set() calls, so
  # the second one ("SHELL:-mcpu=gfx940") silently overwrote the first and
  # dropped -cl-std=CL2.0. Pass both flags in a single list instead.
  # TODO(review): -mcpu=gfx940 hard-codes one GPU for every amdgcn build —
  # this commit is marked WIP; confirm the target selection before landing.
  set( flags "SHELL:-Xclang -cl-std=CL2.0" "SHELL:-mcpu=gfx940" )
else()
  set( flags )
endif()

libclc/amdgcn-amdhsa/libspirv/atomic/atomic_add.cl

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,36 +10,37 @@
1010
#include <spirv/spirv.h>
1111
#include <spirv/spirv_types.h>
1212

13-
extern int __oclc_ISA_version;
14-
15-
//extern float __builtin_amdgcn_global_atomic_fadd_f32(global float *, float);
13+
extern constant int __oclc_ISA_version;
1614

1715
AMDGPU_ATOMIC(_Z18__spirv_AtomicIAdd, int, i, __hip_atomic_fetch_add)
1816
AMDGPU_ATOMIC(_Z18__spirv_AtomicIAdd, unsigned int, j, __hip_atomic_fetch_add)
1917
AMDGPU_ATOMIC(_Z18__spirv_AtomicIAdd, long, l, __hip_atomic_fetch_add)
2018
AMDGPU_ATOMIC(_Z18__spirv_AtomicIAdd, unsigned long, m, __hip_atomic_fetch_add)
21-
//AMDGPU_ATOMIC(_Z21__spirv_AtomicFAddEXT, float, f, __hip_atomic_fetch_add)
2219

23-
#define AMDGPU_ATOMIC_FP32_IMPL(AS, AS_MANGLED, SUB1) \
20+
#define AMDGPU_ATOMIC_FP32_ADD_IMPL(AS, AS_MANGLED, SUB1, CHECK, NEW_BUILTIN) \
2421
_CLC_DEF float \
2522
_Z21__spirv_AtomicFAddEXT##P##AS_MANGLED##fN5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagEf( \
2623
volatile AS float *p, enum Scope scope, \
2724
enum MemorySemanticsMask semantics, float val) { \
25+
if (CHECK) { \
26+
float ret = NEW_BUILTIN(p, val); \
27+
return *(float *)&ret; \
28+
} \
2829
int atomic_scope = 0, memory_order = 0; \
2930
GET_ATOMIC_SCOPE_AND_ORDER(scope, atomic_scope, semantics, memory_order) \
30-
float ret; \
31-
if (__oclc_ISA_version > 9010) \
32-
ret = __builtin_amdgcn_global_atomic_fadd_f32(p, val); \
33-
else \
34-
ret = __hip_atomic_fetch_add(p, val, memory_order, atomic_scope); \
31+
float ret = __hip_atomic_fetch_add(p, val, memory_order, atomic_scope); \
3532
return *(float *)&ret; \
3633
}
3734

38-
AMDGPU_ATOMIC_FP32_IMPL(global, U3AS1, 1)
39-
// AMDGPU_ATOMIC_FP32_IMPL(local, U3AS3, 1)
40-
// AMDGPU_ATOMIC_FP32_IMPL(, , 0)
35+
AMDGPU_ATOMIC_FP32_ADD_IMPL(global, U3AS1, 1, __oclc_ISA_version >= 9010,
36+
__builtin_amdgcn_global_atomic_fadd_f32)
37+
AMDGPU_ATOMIC_FP32_ADD_IMPL(local, U3AS3, 1, __oclc_ISA_version >= 8000,
38+
__builtin_amdgcn_ds_atomic_fadd_f32)
39+
AMDGPU_ATOMIC_FP32_ADD_IMPL(, , 0, __oclc_ISA_version >= 9400,
40+
__builtin_amdgcn_flat_atomic_fadd_f32)
4141

42-
#define AMDGPU_ATOMIC_FP64_ADD_IMPL(AS, AS_MANGLED, SUB1, SUB2) \
42+
#define AMDGPU_ATOMIC_FP64_ADD_IMPL(AS, AS_MANGLED, SUB1, SUB2, CHECK, \
43+
NEW_BUILTIN) \
4344
_CLC_DEF long \
4445
_Z29__spirv_AtomicCompareExchangeP##AS_MANGLED##lN5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagES##SUB2##_ll( \
4546
volatile AS long *, enum Scope, enum MemorySemanticsMask, \
@@ -51,6 +52,10 @@ AMDGPU_ATOMIC_FP32_IMPL(global, U3AS1, 1)
5152
_Z21__spirv_AtomicFAddEXTP##AS_MANGLED##dN5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagEd( \
5253
volatile AS double *p, enum Scope scope, \
5354
enum MemorySemanticsMask semantics, double val) { \
55+
if (CHECK) { \
56+
double ret = NEW_BUILTIN(p, val); \
57+
return *(double *)&ret; \
58+
} \
5459
int atomic_scope = 0, memory_order = 0; \
5560
volatile AS long *int_pointer = (volatile AS long *)p; \
5661
long old_int_val = 0, new_int_val = 0; \
@@ -69,12 +74,16 @@ AMDGPU_ATOMIC_FP32_IMPL(global, U3AS1, 1)
6974
}
7075

7176
#ifdef cl_khr_int64_base_atomics
72-
AMDGPU_ATOMIC_FP64_ADD_IMPL(global, U3AS1, 1, 5)
73-
AMDGPU_ATOMIC_FP64_ADD_IMPL(local, U3AS3, 1, 5)
74-
AMDGPU_ATOMIC_FP64_ADD_IMPL(, , 0, 4)
77+
AMDGPU_ATOMIC_FP64_ADD_IMPL(global, U3AS1, 1, 5, __oclc_ISA_version >= 9010,
78+
__builtin_amdgcn_global_atomic_fadd_f64)
79+
AMDGPU_ATOMIC_FP64_ADD_IMPL(local, U3AS3, 1, 5, __oclc_ISA_version >= 9010,
80+
__builtin_amdgcn_ds_atomic_fadd_f64)
81+
AMDGPU_ATOMIC_FP64_ADD_IMPL(, , 0, 4, __oclc_ISA_version >= 9400,
82+
__builtin_amdgcn_flat_atomic_fadd_f64)
7583
#endif
7684

7785
#undef AMDGPU_ATOMIC
7886
#undef AMDGPU_ATOMIC_IMPL
87+
#undef AMDGPU_ATOMIC_FP32_ADD_IMPL
7988
#undef AMDGPU_ATOMIC_FP64_ADD_IMPL
8089
#undef GET_ATOMIC_SCOPE_AND_ORDER

0 commit comments

Comments
 (0)