
Commit 1186e9d

[LLVM][AMDGPU] Specialize 32-bit atomic fadd instruction for generic address space
The 32-bit floating-point atomic add instructions on AMDGPUs do not support a "flat" or "generic" address space. So, if the address space cannot be determined statically, the AMDGPU backend will fall back to a CAS loop (which does support "flat" addressing). Instead, this patch emits runtime address-space checks to allow native FP atomic add instructions for global and LDS memory (and non-atomic FP add instructions for private/scratch memory).

In order to do that, this patch introduces a new interface function `emitExpandAtomicRMW`. It is expected to be called when the common atomic expansions don't work for a specific target, as is the case here.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D129690
1 parent 93c7a9b commit 1186e9d
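
At the source level, the expansion amounts to dispatching on the pointer's actual address space at runtime. The following HIP-flavored sketch is illustrative only and is not code from this patch: __builtin_amdgcn_is_shared and __builtin_amdgcn_is_private are the Clang builtins corresponding to the intrinsics the expansion emits, while lds_atomic_fadd and global_atomic_fadd are hypothetical stand-ins for the native instructions.

__device__ float flat_atomic_fadd(float *Addr, float Val) {
  if (__builtin_amdgcn_is_shared(Addr))
    return lds_atomic_fadd(Addr, Val);    // hypothetical: native LDS fadd
  if (__builtin_amdgcn_is_private(Addr)) {
    float Old = *Addr;                    // scratch is thread-private, so a
    *Addr = Old + Val;                    // plain read-modify-write suffices
    return Old;
  }
  return global_atomic_fadd(Addr, Val);   // hypothetical: native global fadd
}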

File tree

7 files changed (+979, -14 lines)


llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
@@ -2000,6 +2000,14 @@ class TargetLoweringBase {
     llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform a atomicrmw expansion using a target-specific way. This is
+  /// expected to be called when masked atomicrmw and bit test atomicrmw don't
+  /// work, and the target supports another way to lower atomicrmw.
+  virtual void emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+    llvm_unreachable(
+        "Generic atomicrmw expansion unimplemented on this target");
+  }
+
   /// Perform a bit test atomicrmw using a target-specific intrinsic. This
   /// represents the combined bit test intrinsic which will be lowered at a late
   /// stage by the backend.
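
The contract of this hook is that the override must fully lower the atomicrmw itself: emit the replacement IR, rewrite all uses, and erase the original instruction, since AtomicExpandPass reports the instruction as handled unconditionally (see the next file). A minimal sketch for a hypothetical MyTargetLowering; the non-atomic body is a placeholder to show the structure, not a real lowering:

void MyTargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  IRBuilder<> Builder(AI);
  // Emit whatever replacement sequence the target supports; this placeholder
  // is a plain (non-atomic) load/op/store.
  Value *Loaded =
      Builder.CreateLoad(AI->getType(), AI->getPointerOperand(), "loaded");
  Value *NewVal = Builder.CreateFAdd(Loaded, AI->getValOperand(), "new");
  Builder.CreateStore(NewVal, AI->getPointerOperand());
  // The hook owns the instruction: forward the old value and delete it.
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
}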

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 3 additions & 0 deletions
@@ -610,6 +610,9 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
   }
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicRMWInst(AI);
+  case TargetLoweringBase::AtomicExpansionKind::Expand:
+    TLI->emitExpandAtomicRMW(AI);
+    return true;
   default:
     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
   }
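
The new case only fires when the target's shouldExpandAtomicRMWInIR asks for it by returning AtomicExpansionKind::Expand. A minimal sketch of the opt-in side, again for a hypothetical MyTargetLowering:

TargetLoweringBase::AtomicExpansionKind
MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  // Route the cases this target wants to lower itself to the new hook;
  // everything else keeps the generic CAS-loop (CmpXChg) expansion.
  if (RMW->getOperation() == AtomicRMWInst::FAdd)
    return AtomicExpansionKind::Expand; // handled by emitExpandAtomicRMW
  return AtomicExpansionKind::CmpXChg;
}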

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 151 additions & 0 deletions
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -12866,6 +12867,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
       return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
+    // If it is in flat address space, and the type is float, we will try to
+    // expand it, if the target supports global and lds atomic fadd. The
+    // reason we need that is, in the expansion, we emit the check of address
+    // space. If it is in global address space, we emit the global atomic
+    // fadd; if it is in shared address space, we emit the LDS atomic fadd.
+    if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
+        Subtarget->hasLDSFPAtomicAdd()) {
+      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+        return AtomicExpansionKind::Expand;
+      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+        return AtomicExpansionKind::Expand;
+    }
+
     return AtomicExpansionKind::CmpXChg;
   }

@@ -13066,3 +13080,140 @@ bool SITargetLowering::checkForPhysRegDependency(
   }
   return false;
 }
+
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  assert(Subtarget->hasAtomicFaddInsts() &&
+         "target should have atomic fadd instructions");
+  assert(AI->getType()->isFloatTy() &&
+         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+         "generic atomicrmw expansion only supports FP32 operand in flat "
+         "address space");
+  assert(AI->getOperation() == AtomicRMWInst::FAdd &&
+         "only fadd is supported for now");
+
+  // Given: atomicrmw fadd float* %addr, float %val ordering
+  //
+  // With this expansion we produce the following code:
+  //   [...]
+  //   %int8ptr = bitcast float* %addr to i8*
+  //   br label %atomicrmw.check.shared
+  //
+  // atomicrmw.check.shared:
+  //   %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr)
+  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
+  //
+  // atomicrmw.shared:
+  //   %cast.shared = addrspacecast float* %addr to float addrspace(3)*
+  //   %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared,
+  //                    float %val ordering
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.check.private:
+  //   %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr)
+  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
+  //
+  // atomicrmw.private:
+  //   %cast.private = addrspacecast float* %addr to float addrspace(5)*
+  //   %loaded.private = load float, float addrspace(5)* %cast.private
+  //   %val.new = fadd float %loaded.private, %val
+  //   store float %val.new, float addrspace(5)* %cast.private
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.global:
+  //   %cast.global = addrspacecast float* %addr to float addrspace(1)*
+  //   %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global,
+  //                    float %val ordering
+  //   br label %atomicrmw.phi
+  //
+  // atomicrmw.phi:
+  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
+  //                           [ %loaded.private, %atomicrmw.private ],
+  //                           [ %loaded.global, %atomicrmw.global ]
+  //   br label %atomicrmw.end
+  //
+  // atomicrmw.end:
+  //   [...]
+
+  IRBuilder<> Builder(AI);
+  LLVMContext &Ctx = Builder.getContext();
+
+  BasicBlock *BB = Builder.GetInsertBlock();
+  Function *F = BB->getParent();
+  BasicBlock *ExitBB =
+      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+  BasicBlock *CheckSharedBB =
+      BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
+  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+  BasicBlock *CheckPrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *PrivateBB =
+      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
+  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
+  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
+
+  Value *Val = AI->getValOperand();
+  Type *ValTy = Val->getType();
+  Value *Addr = AI->getPointerOperand();
+  PointerType *PtrTy = cast<PointerType>(Addr->getType());
+
+  auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
+                                 Value *Val) -> Value * {
+    AtomicRMWInst *OldVal =
+        Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
+                                AI->getOrdering(), AI->getSyncScopeID());
+    SmallVector<std::pair<unsigned, MDNode *>> MDs;
+    AI->getAllMetadata(MDs);
+    for (auto &P : MDs)
+      OldVal->setMetadata(P.first, P.second);
+    return OldVal;
+  };
+
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
+  Builder.CreateBr(CheckSharedBB);
+
+  Builder.SetInsertPoint(CheckSharedBB);
+  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
+                                               {Int8Ptr}, nullptr, "is.shared");
+  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+
+  Builder.SetInsertPoint(SharedBB);
+  Value *CastToLocal = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(CheckPrivateBB);
+  CallInst *IsPrivate = Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private");
+  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
+
+  Builder.SetInsertPoint(PrivateBB);
+  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS));
+  Value *LoadedPrivate =
+      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
+  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
+  Builder.CreateStore(NewVal, CastToPrivate);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(GlobalBB);
+  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+      Addr,
+      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
+  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
+  Builder.CreateBr(PhiBB);
+
+  Builder.SetInsertPoint(PhiBB);
+  PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
+  Loaded->addIncoming(LoadedShared, SharedBB);
+  Loaded->addIncoming(LoadedPrivate, PrivateBB);
+  Loaded->addIncoming(LoadedGlobal, GlobalBB);
+  Builder.CreateBr(ExitBB);
+
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+}
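
One detail worth noting in the shouldExpandAtomicRMWInIR change above: whether the expansion is usable depends on whether the atomic's result is consumed, because a subtarget may implement only the no-return flavor of the native fadd. Restated as a hypothetical helper (illustrative only, not part of the patch):

static bool canUseNativeFAdd(const AtomicRMWInst *RMW,
                             const GCNSubtarget *Subtarget) {
  // Pick the flavor of the native fadd that matches how the result is used.
  return RMW->use_empty() ? Subtarget->hasAtomicFaddNoRtnInsts()
                          : Subtarget->hasAtomicFaddRtnInsts();
}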

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -493,6 +493,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+  void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
 
   const TargetRegisterClass *getRegClassFor(MVT VT,
                                             bool isDivergent) const override;
