Skip to content

Commit f5747ae

Browse files
committed
clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64}
The atomicrmw needs an explicit syncscope and the new metadata so that, on most targets, the native instruction is selected instead of a cmpxchg expansion.
1 parent d942afd commit f5747ae

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "llvm/IR/MDBuilder.h"
5959
#include "llvm/IR/MatrixBuilder.h"
6060
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+
#include "llvm/Support/AMDGPUAddrSpace.h"
6162
#include "llvm/Support/ConvertUTF.h"
6263
#include "llvm/Support/MathExtras.h"
6364
#include "llvm/Support/ScopedPrinter.h"
@@ -18790,8 +18791,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1879018791
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1879118792
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1879218793
}
18793-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18794-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1879518794
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1879618795
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1879718796
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18803,18 +18802,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1880318802
Intrinsic::ID IID;
1880418803
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1880518804
switch (BuiltinID) {
18806-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18807-
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18808-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18809-
break;
1881018805
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1881118806
ArgTy = llvm::FixedVectorType::get(
1881218807
llvm::Type::getHalfTy(getLLVMContext()), 2);
1881318808
IID = Intrinsic::amdgcn_global_atomic_fadd;
1881418809
break;
18815-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18816-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18817-
break;
1881818810
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1881918811
IID = Intrinsic::amdgcn_global_atomic_fmin;
1882018812
break;
@@ -19237,7 +19229,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1923719229
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1923819230
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1923919231
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
19240-
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
19232+
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
19233+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19234+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1924119235
llvm::AtomicRMWInst::BinOp BinOp;
1924219236
switch (BuiltinID) {
1924319237
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19253,6 +19247,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1925319247
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1925419248
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1925519249
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19250+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19251+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1925619252
BinOp = llvm::AtomicRMWInst::FAdd;
1925719253
break;
1925819254
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19287,8 +19283,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1928719283
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1928819284
EmitScalarExpr(E->getArg(3)), AO, SSID);
1928919285
} else {
19290-
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19291-
SSID = llvm::SyncScope::System;
19286+
// Most of the builtins do not have syncscope/order arguments. For DS
19287+
// atomics the scope doesn't really matter, as they implicitly operate at
19288+
// workgroup scope.
19289+
//
19290+
// The global/flat cases need to use agent scope to consistently produce
19291+
// the native instruction instead of a cmpxchg expansion.
19292+
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1929219293
AO = AtomicOrdering::SequentiallyConsistent;
1929319294

1929419295
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19303,6 +19304,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1930319304
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1930419305
if (Volatile)
1930519306
RMW->setVolatile(true);
19307+
19308+
unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19309+
if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19310+
// Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19311+
// instruction for flat and global operations.
19312+
llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19313+
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19314+
19315+
// Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19316+
// instruction, but this only matters for float fadd.
19317+
if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19318+
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19319+
}
19320+
1930619321
return Builder.CreateBitCast(RMW, OrigTy);
1930719322
}
1930819323
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
4949
}
5050

5151
// CHECK-LABEL: @test_global_add_f32
52-
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
52+
// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
5353
void test_global_add_f32(float *rtn, global float *addr, float x) {
5454
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
5555
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
14+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
2222

2323
// CHECK-LABEL: test_local_add_2bf16_noret
2424
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
25-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
25+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
2626
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
2727

2828
// GFX12-LABEL: test_local_add_2bf16_noret

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
typedef half __attribute__((ext_vector_type(2))) half2;
1010

1111
// CHECK-LABEL: test_global_add_f64
12-
// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
12+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
1313
// GFX90A-LABEL: test_global_add_f64$local:
1414
// GFX90A: global_atomic_add_f64
1515
void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
117117
}
118118

119119
// CHECK-LABEL: @test_global_add_f32
120-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
120+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
121121
void test_global_add_f32(float *rtn, global float *addr, float x) {
122122
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
123123
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
4444
// CHECK-LABEL: test_local_add_2bf16
4545

4646
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
47-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
47+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
4848
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
4949

5050
// GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
7070
}
7171

7272
// CHECK-LABEL: @test_global_add_f32
73-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
73+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
7474
void test_global_add_f32(float *rtn, global float *addr, float x) {
7575
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
7676
}

0 commit comments

Comments
 (0)