@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18776,8 +18777,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18789,18 +18788,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19223,7 +19215,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
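With the f32 and f64 global fadd builtins folded into this generic atomicrmw
path, their source-level signatures are unchanged; only the IR clang emits for
them changes. A hypothetical device-side usage (HIP-style C++; the builtin
still requires a GPU target with the returning atomic-fadd instructions; the
names "GlobalFloat" and "bump" are illustrative, not from the patch):

    typedef __attribute__((address_space(1))) float GlobalFloat;

    extern "C" float bump(GlobalFloat *Counter, float V) {
      // Previously lowered to a call of llvm.amdgcn.global.atomic.fadd.
      // With this patch it becomes, roughly:
      //   atomicrmw fadd ptr addrspace(1) %Counter, float %V
      //     syncscope("agent") seq_cst, align 4,
      //     !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
      return __builtin_amdgcn_global_atomic_fadd_f32(Counter, V);
    }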
@@ -19239,6 +19233,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19273,8 +19269,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
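For reference, a standalone sketch of how a named syncscope is interned and
attached to an atomicrmw through the plain LLVM C++ API (not clang's CGBuilder
wrappers; the function name and types are illustrative):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"

    // Emit: atomicrmw fadd ... syncscope("agent") seq_cst on a global pointer.
    llvm::Function *emitAgentScopedFAdd(llvm::Module &M) {
      llvm::LLVMContext &Ctx = M.getContext();
      llvm::IRBuilder<> B(Ctx);

      llvm::Type *FloatTy = B.getFloatTy();
      llvm::PointerType *GlobalPtrTy =
          llvm::PointerType::get(Ctx, /*AddressSpace=*/1);
      llvm::FunctionType *FTy =
          llvm::FunctionType::get(FloatTy, {GlobalPtrTy, FloatTy}, false);
      llvm::Function *F = llvm::Function::Create(
          FTy, llvm::Function::ExternalLinkage, "agent_fadd", M);
      B.SetInsertPoint(llvm::BasicBlock::Create(Ctx, "entry", F));

      // Named syncscopes are interned on the LLVMContext. "agent" only orders
      // against the current device, which is what lets the AMDGPU backend pick
      // the native instruction over a cmpxchg loop.
      llvm::SyncScope::ID Agent = Ctx.getOrInsertSyncScopeID("agent");
      llvm::AtomicRMWInst *RMW = B.CreateAtomicRMW(
          llvm::AtomicRMWInst::FAdd, F->getArg(0), F->getArg(1),
          llvm::MaybeAlign(4), llvm::AtomicOrdering::SequentiallyConsistent,
          Agent);
      B.CreateRet(RMW);
      return F;
    }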
@@ -19289,6 +19290,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
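The two metadata kinds are attached as empty MDNodes; only their presence
matters. A minimal standalone sketch of the same annotation outside of clang
(the annotate() helper is illustrative, not an API from the patch):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"

    void annotate(llvm::AtomicRMWInst *RMW) {
      llvm::LLVMContext &Ctx = RMW->getContext();
      llvm::MDNode *EmptyMD = llvm::MDNode::get(Ctx, {});

      // Promise the location is not fine-grained host-visible memory, so the
      // native global/flat atomic can be selected on targets that would
      // otherwise have to expand to a cmpxchg loop.
      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

      // Only float fadd additionally cares about denormal-mode handling.
      if (RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
          RMW->getValOperand()->getType()->isFloatTy())
        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
    }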