Skip to content

Commit 099aec4

Browse files
committed
clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64}
The emitted atomicrmw needs a syncscope and the new metadata for the backend to select the native instruction in most cases.
1 parent e398da2 commit 099aec4

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "llvm/IR/MDBuilder.h"
5959
#include "llvm/IR/MatrixBuilder.h"
6060
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+
#include "llvm/Support/AMDGPUAddrSpace.h"
6162
#include "llvm/Support/ConvertUTF.h"
6263
#include "llvm/Support/MathExtras.h"
6364
#include "llvm/Support/ScopedPrinter.h"
@@ -18919,8 +18920,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1891918920
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1892018921
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1892118922
}
18922-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18923-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1892418923
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1892518924
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1892618925
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18932,18 +18931,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1893218931
Intrinsic::ID IID;
1893318932
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1893418933
switch (BuiltinID) {
18935-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18936-
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18937-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18938-
break;
1893918934
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1894018935
ArgTy = llvm::FixedVectorType::get(
1894118936
llvm::Type::getHalfTy(getLLVMContext()), 2);
1894218937
IID = Intrinsic::amdgcn_global_atomic_fadd;
1894318938
break;
18944-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18945-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18946-
break;
1894718939
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1894818940
IID = Intrinsic::amdgcn_global_atomic_fmin;
1894918941
break;
@@ -19366,7 +19358,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1936619358
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1936719359
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1936819360
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
19369-
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
19361+
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
19362+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19363+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1937019364
llvm::AtomicRMWInst::BinOp BinOp;
1937119365
switch (BuiltinID) {
1937219366
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19382,6 +19376,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1938219376
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1938319377
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1938419378
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19379+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19380+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1938519381
BinOp = llvm::AtomicRMWInst::FAdd;
1938619382
break;
1938719383
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19416,8 +19412,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1941619412
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1941719413
EmitScalarExpr(E->getArg(3)), AO, SSID);
1941819414
} else {
19419-
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19420-
SSID = llvm::SyncScope::System;
19415+
// Most of the builtins do not have syncscope/order arguments. For DS
19416+
// atomics the scope doesn't really matter, as they implicitly operate at
19417+
// workgroup scope.
19418+
//
19419+
// The global/flat cases need to use agent scope to consistently produce
19420+
// the native instruction instead of a cmpxchg expansion.
19421+
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1942119422
AO = AtomicOrdering::SequentiallyConsistent;
1942219423

1942319424
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19432,6 +19433,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1943219433
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1943319434
if (Volatile)
1943419435
RMW->setVolatile(true);
19436+
19437+
unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19438+
if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19439+
// Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19440+
// instruction for flat and global operations.
19441+
llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19442+
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19443+
19444+
// Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19445+
// instruction, but this only matters for float fadd.
19446+
if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19447+
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19448+
}
19449+
1943519450
return Builder.CreateBitCast(RMW, OrigTy);
1943619451
}
1943719452
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
4949
}
5050

5151
// CHECK-LABEL: @test_global_add_f32
52-
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
52+
// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
5353
void test_global_add_f32(float *rtn, global float *addr, float x) {
5454
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
5555
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
14+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
2222

2323
// CHECK-LABEL: test_local_add_2bf16_noret
2424
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
25-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
25+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
2626
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
2727

2828
// GFX12-LABEL: test_local_add_2bf16_noret

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
typedef half __attribute__((ext_vector_type(2))) half2;
1010

1111
// CHECK-LABEL: test_global_add_f64
12-
// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
12+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
1313
// GFX90A-LABEL: test_global_add_f64$local:
1414
// GFX90A: global_atomic_add_f64
1515
void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
117117
}
118118

119119
// CHECK-LABEL: @test_global_add_f32
120-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
120+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
121121
void test_global_add_f32(float *rtn, global float *addr, float x) {
122122
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
123123
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
4444
// CHECK-LABEL: test_local_add_2bf16
4545

4646
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
47-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
47+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
4848
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
4949

5050
// GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
7070
}
7171

7272
// CHECK-LABEL: @test_global_add_f32
73-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
73+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
7474
void test_global_add_f32(float *rtn, global float *addr, float x) {
7575
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
7676
}

0 commit comments

Comments
 (0)