Skip to content

Commit ef284fd

Browse files
committed
clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64}
The atomicrmw needs an explicit syncscope and the new metadata attached for the native instruction to be emitted on most targets, rather than a cmpxchg expansion.
1 parent f1a1777 commit ef284fd

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "llvm/IR/MDBuilder.h"
5959
#include "llvm/IR/MatrixBuilder.h"
6060
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+
#include "llvm/Support/AMDGPUAddrSpace.h"
6162
#include "llvm/Support/ConvertUTF.h"
6263
#include "llvm/Support/MathExtras.h"
6364
#include "llvm/Support/ScopedPrinter.h"
@@ -18743,8 +18744,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1874318744
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1874418745
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1874518746
}
18746-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18747-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1874818747
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1874918748
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1875018749
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18756,18 +18755,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1875618755
Intrinsic::ID IID;
1875718756
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1875818757
switch (BuiltinID) {
18759-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18760-
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18761-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18762-
break;
1876318758
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1876418759
ArgTy = llvm::FixedVectorType::get(
1876518760
llvm::Type::getHalfTy(getLLVMContext()), 2);
1876618761
IID = Intrinsic::amdgcn_global_atomic_fadd;
1876718762
break;
18768-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18769-
IID = Intrinsic::amdgcn_global_atomic_fadd;
18770-
break;
1877118763
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1877218764
IID = Intrinsic::amdgcn_global_atomic_fmin;
1877318765
break;
@@ -19190,7 +19182,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1919019182
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1919119183
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1919219184
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
19193-
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
19185+
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
19186+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19187+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1919419188
llvm::AtomicRMWInst::BinOp BinOp;
1919519189
switch (BuiltinID) {
1919619190
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19206,6 +19200,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1920619200
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1920719201
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1920819202
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19203+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19204+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1920919205
BinOp = llvm::AtomicRMWInst::FAdd;
1921019206
break;
1921119207
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19240,8 +19236,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1924019236
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1924119237
EmitScalarExpr(E->getArg(3)), AO, SSID);
1924219238
} else {
19243-
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19244-
SSID = llvm::SyncScope::System;
19239+
// Most of the builtins do not have syncscope/order arguments. For DS
19240+
// atomics the scope doesn't really matter, as they implicitly operate at
19241+
// workgroup scope.
19242+
//
19243+
// The global/flat cases need to use agent scope to consistently produce
19244+
// the native instruction instead of a cmpxchg expansion.
19245+
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1924519246
AO = AtomicOrdering::SequentiallyConsistent;
1924619247

1924719248
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19256,6 +19257,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1925619257
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1925719258
if (Volatile)
1925819259
RMW->setVolatile(true);
19260+
19261+
unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19262+
if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19263+
// Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19264+
// instruction for flat and global operations.
19265+
llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19266+
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19267+
19268+
// Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19269+
// instruction, but this only matters for float fadd.
19270+
if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19271+
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19272+
}
19273+
1925919274
return Builder.CreateBitCast(RMW, OrigTy);
1926019275
}
1926119276
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
4949
}
5050

5151
// CHECK-LABEL: @test_global_add_f32
52-
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
52+
// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
5353
void test_global_add_f32(float *rtn, global float *addr, float x) {
5454
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
5555
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
14+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
2222

2323
// CHECK-LABEL: test_local_add_2bf16_noret
2424
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
25-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
25+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
2626
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
2727

2828
// GFX12-LABEL: test_local_add_2bf16_noret

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
typedef half __attribute__((ext_vector_type(2))) half2;
1010

1111
// CHECK-LABEL: test_global_add_f64
12-
// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
12+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
1313
// GFX90A-LABEL: test_global_add_f64$local:
1414
// GFX90A: global_atomic_add_f64
1515
void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
117117
}
118118

119119
// CHECK-LABEL: @test_global_add_f32
120-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
120+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
121121
void test_global_add_f32(float *rtn, global float *addr, float x) {
122122
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
123123
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
4444
// CHECK-LABEL: test_local_add_2bf16
4545

4646
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
47-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
47+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
4848
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
4949

5050
// GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
7070
}
7171

7272
// CHECK-LABEL: @test_global_add_f32
73-
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
73+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
7474
void test_global_add_f32(float *rtn, global float *addr, float x) {
7575
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
7676
}

0 commit comments

Comments
 (0)