Skip to content

Commit bd298a4

Browse files
committed
clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64}
In most cases, the syncscope and the new metadata must be emitted to get the native instruction.
1 parent 917be5c commit bd298a4

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "llvm/IR/MDBuilder.h"
5959
#include "llvm/IR/MatrixBuilder.h"
6060
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+
#include "llvm/Support/AMDGPUAddrSpace.h"
6162
#include "llvm/Support/ConvertUTF.h"
6263
#include "llvm/Support/MathExtras.h"
6364
#include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1865418655
Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
1865518656
return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
1865618657
}
18657-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18658-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1865918658
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1866018659
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1866118660
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1866718666
Intrinsic::ID IID;
1866818667
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1866918668
switch (BuiltinID) {
18670-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18671-
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18672-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18673-
break;
1867418669
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1867518670
ArgTy = llvm::FixedVectorType::get(
1867618671
llvm::Type::getHalfTy(getLLVMContext()), 2);
1867718672
IID = Intrinsic::amdgcn_global_atomic_fadd;
1867818673
break;
18679-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18680-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18681-
break;
1868218674
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1868318675
IID = Intrinsic::amdgcn_global_atomic_fmin;
1868418676
break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1909119083
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1909219084
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1909319085
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
19094-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
19086+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19087+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19088+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1909519089
llvm::AtomicRMWInst::BinOp BinOp;
1909619090
switch (BuiltinID) {
1909719091
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1910719101
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1910819102
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1910919103
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19104+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19105+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1911019106
BinOp = llvm::AtomicRMWInst::FAdd;
1911119107
break;
1911219108
}
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1913319129
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1913419130
EmitScalarExpr(E->getArg(3)), AO, SSID);
1913519131
} else {
19136-
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19137-
SSID = llvm::SyncScope::System;
19132+
// Most of the builtins do not have syncscope/order arguments. For DS
19133+
// atomics the scope doesn't really matter, as they implicitly operate at
19134+
// workgroup scope.
19135+
//
19136+
// The global/flat cases need to use agent scope to consistently produce
19137+
// the native instruction instead of a cmpxchg expansion.
19138+
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1913819139
AO = AtomicOrdering::SequentiallyConsistent;
1913919140

1914019141
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1914919150
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1915019151
if (Volatile)
1915119152
RMW->setVolatile(true);
19153+
19154+
unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19155+
if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19156+
// Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19157+
// instruction for flat and global operations.
19158+
llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19159+
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19160+
19161+
// Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19162+
// instruction, but this only matters for float fadd.
19163+
if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19164+
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19165+
}
19166+
1915219167
return Builder.CreateBitCast(RMW, OrigTy);
1915319168
}
1915419169
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
4949
}
5050

5151
// CHECK-LABEL: @test_global_add_f32
52-
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
52+
// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
5353
void test_global_add_f32(float *rtn, global float *addr, float x) {
5454
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
5555
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
14+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
2222

2323
// CHECK-LABEL: test_local_add_2bf16_noret
2424
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
25-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
25+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
2626
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
2727

2828
// GFX12-LABEL: test_local_add_2bf16_noret

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
typedef half __attribute__((ext_vector_type(2))) half2;
1010

1111
// CHECK-LABEL: test_global_add_f64
12-
// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
12+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
1313
// GFX90A-LABEL: test_global_add_f64$local:
1414
// GFX90A: global_atomic_add_f64
1515
void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
117117
}
118118

119119
// CHECK-LABEL: @test_global_add_f32
120-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
120+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
121121
void test_global_add_f32(float *rtn, global float *addr, float x) {
122122
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
123123
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
4444
// CHECK-LABEL: test_local_add_2bf16
4545

4646
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
47-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
47+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
4848
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
4949

5050
// GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
7070
}
7171

7272
// CHECK-LABEL: @test_global_add_f32
73-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
73+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
7474
void test_global_add_f32(float *rtn, global float *addr, float x) {
7575
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
7676
}

0 commit comments

Comments
 (0)