@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18919,8 +18920,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18932,18 +18931,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19366,7 +19358,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19382,6 +19376,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
      BinOp = llvm::AtomicRMWInst::FAdd;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
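
For context (not part of the patch): after this change, the two redirected builtins take the generic atomicrmw path below. A minimal usage sketch, assuming an amdgcn target and hypothetical function names:

// Sketch only: hypothetical test functions, amdgcn target assumed.
// Both calls now lower to `atomicrmw fadd ... syncscope("agent")` with the
// metadata added further down, rather than to llvm.amdgcn.global.atomic.fadd.
float add_f32(__attribute__((address_space(1))) float *Ptr, float Val) {
  return __builtin_amdgcn_global_atomic_fadd_f32(Ptr, Val);
}

double add_f64(__attribute__((address_space(1))) double *Ptr, double Val) {
  return __builtin_amdgcn_global_atomic_fadd_f64(Ptr, Val);
}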
@@ -19416,8 +19412,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
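
Aside (not part of the patch): named sync scopes are interned on the LLVMContext and referenced by ID, so the assignment above prints as syncscope("agent") in textual IR. A self-contained sketch of the same mechanism:

// Sketch only: attach AMDGPU's "agent" (device) scope to an existing
// atomicrmw. It is narrower than the default "system" scope, which is what
// lets the backend select the native instruction instead of a cmpxchg loop.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

static void useAgentScope(llvm::AtomicRMWInst &RMW) {
  llvm::LLVMContext &Ctx = RMW.getContext();
  RMW.setSyncScopeID(Ctx.getOrInsertSyncScopeID("agent"));
}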
@@ -19432,6 +19433,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
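
Aside (not part of the patch): both markers are presence-only metadata, i.e. empty MDNodes whose kind name alone carries the meaning, so later passes can test for them by name. A minimal sketch of the pattern:

// Sketch only: set and query a presence-only metadata marker.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

static void markNoFineGrainedMemory(llvm::AtomicRMWInst &RMW) {
  llvm::MDNode *Empty = llvm::MDNode::get(RMW.getContext(), {});
  RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty);
}

static bool hasNoFineGrainedMemory(const llvm::AtomicRMWInst &RMW) {
  return RMW.hasMetadata("amdgpu.no.fine.grained.memory");
}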