@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18790,8 +18791,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18803,18 +18802,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19237,7 +19229,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19253,6 +19247,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19287,8 +19283,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;
 
       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19303,6 +19304,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
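
For reference, here is a minimal standalone sketch of what the new lowering amounts to at the IRBuilder level. The helper name emitGlobalAtomicFAdd and the free-standing setup are illustrative assumptions rather than part of the patch, and it assumes a recent LLVM where CreateAtomicRMW takes a MaybeAlign parameter:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Emits: atomicrmw fadd ptr %p, <ty> %v syncscope("agent") seq_cst
// tagged with !amdgpu.no.fine.grained.memory, and for f32 additionally
// with !amdgpu.ignore.denormal.mode.
static AtomicRMWInst *emitGlobalAtomicFAdd(IRBuilder<> &Builder, Value *Ptr,
                                           Value *Val) {
  LLVMContext &Ctx = Builder.getContext();

  // Agent scope (not system) is what lets the AMDGPU backend select the
  // native global atomic instead of expanding to a cmpxchg loop.
  SyncScope::ID Agent = Ctx.getOrInsertSyncScopeID("agent");
  AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
      AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
      AtomicOrdering::SequentiallyConsistent, Agent);

  // Empty metadata nodes act as boolean flags on the instruction.
  MDNode *EmptyMD = MDNode::get(Ctx, {});
  RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
  if (Val->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
  return RMW;
}

The net effect is that __builtin_amdgcn_global_atomic_fadd_f32 and __builtin_amdgcn_global_atomic_fadd_f64 no longer lower through the amdgcn.global.atomic.fadd intrinsic; they share the generic atomicrmw path with the ds_* builtins, and the backend chooses between the native instruction and a cmpxchg expansion based on the scope and metadata.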