@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;
 
       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
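
As a companion to the diff, here is a minimal standalone sketch, not part of the patch, that builds the same shape of IR the f32/f64 builtins now lower to with the plain LLVM C++ API: an atomicrmw fadd on an addrspace(1) pointer at syncscope("agent"), tagged with the two metadata kinds the backend checks before emitting the native instruction. It assumes a recent LLVM with opaque pointers; the function name global_fadd_demo and the main driver are invented for illustration.

// Standalone sketch; link against the LLVM libraries.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);

  // float fadd on a global (addrspace 1) pointer, mirroring
  // __builtin_amdgcn_global_atomic_fadd_f32.
  Type *FloatTy = Type::getFloatTy(Ctx);
  PointerType *GlobalPtrTy = PointerType::get(Ctx, /*AddressSpace=*/1);
  FunctionType *FTy =
      FunctionType::get(FloatTy, {GlobalPtrTy, FloatTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage,
                                 "global_fadd_demo", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(BB);

  // Agent scope, matching what the builtins now use so the backend can
  // select the native instruction rather than a cmpxchg expansion.
  SyncScope::ID Agent = Ctx.getOrInsertSyncScopeID("agent");
  AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
      AtomicRMWInst::FAdd, F->getArg(0), F->getArg(1), MaybeAlign(),
      AtomicOrdering::SequentiallyConsistent, Agent);

  // Empty metadata nodes opting in to the native atomic.
  MDNode *EmptyMD = MDNode::get(Ctx, {});
  RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
  // The denormal-mode metadata only matters for float fadd.
  RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);

  Builder.CreateRet(RMW);
  M.print(outs(), nullptr);
  return 0;
}

Printing the module should show something like atomicrmw fadd ptr addrspace(1) %0, float %1 syncscope("agent") seq_cst with !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode attachments, the same instruction the rewritten builtin path emits in place of the llvm.amdgcn.global.atomic.fadd intrinsic calls for the f32/f64 cases.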