@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18743,8 +18744,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18756,18 +18755,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19190,7 +19182,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19206,6 +19200,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19240,8 +19236,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
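Note: for the builtins without explicit order/scope operands, the atomicrmw emitted below now carries an explicit syncscope. A minimal sketch of the IR expected for the f64 global case, with illustrative value names rather than verbatim IRGen output:

    %old = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, align 8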
@@ -19256,6 +19257,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
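With the metadata attached, the f32 global case should come out of IRGen roughly as follows (a sketch under the same assumptions; !0 is the empty tuple created by MDNode::get above):

    %old = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0

    !0 = !{}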