 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16328,12 +16329,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
         : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
+/// Return true if a flat address space atomicrmw may access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  // !noalias.addrspace holds [Low, High) pairs of address spaces the access
+  // is known not to touch; the access may be private unless one of the
+  // ranges covers AMDGPUAS::PRIVATE_ADDRESS.
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    if (Low->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ugt(AMDGPUAS::PRIVATE_ADDRESS))
+      return false;
+  }
+
+  return true;
+}
+
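For context, a minimal IR sketch (illustrative, not part of this patch) of the `!noalias.addrspace` encoding the helper walks. Operands come in [low, high) pairs of excluded address spaces; on AMDGPU, address space 3 is local and 5 is private (scratch):

```llvm
; The ranges [3, 4) and [5, 6) exclude local and private. Because the second
; range covers address space 5, flatInstrMayAccessPrivate returns false for
; this instruction and no expansion is needed.
define i64 @no_private(ptr %flat.ptr, i64 %val) {
  %old = atomicrmw add ptr %flat.ptr, i64 %val seq_cst, !noalias.addrspace !0
  ret i64 %old
}

!0 = !{i32 3, i32 4, i32 5, i32 6}
```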
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically turn out to address private memory
+  // are silently dropped, so they must be expanded with a runtime check for
+  // the private case.
+  //
+  // Note that we will emit a new copy of the original atomic in the
+  // expansion, which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
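And the converse case, sketched as IR (illustrative): a 64-bit flat `atomicrmw` with no `!noalias.addrspace` metadata, which conservatively may address private memory and therefore now takes the Expand path:

```llvm
; With no metadata, flatInstrMayAccessPrivate returns true; the type is 64
; bits wide and the pointer is flat, so shouldExpandAtomicRMWInIR returns
; AtomicExpansionKind::Expand instead of selecting the instruction directly.
define i64 @may_be_private(ptr %flat.ptr, i64 %val) {
  %old = atomicrmw add ptr %flat.ptr, i64 %val seq_cst
  ret i64 %old
}
```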
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
@@ -16732,20 +16766,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
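A sketch of the de-canonicalization this now guards (illustrative only): an identity-value `or`/`xor`/`sub` behaves as an atomic load, and rewriting it in place to `add 0` maps it onto a natively supported atomic:

```llvm
; before: %old = atomicrmw or  ptr addrspace(1) %p, i64 0 seq_cst
; after setOperation(AtomicRMWInst::Add), the instruction reads as:
define i64 @identity_or(ptr addrspace(1) %p) {
  %old = atomicrmw add ptr addrspace(1) %p, i64 0 seq_cst
  ret i64 %old
}
```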
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private
+  // case, where we only insert a check for private and still use the flat
+  // instruction for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16785,6 +16833,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //    [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a
+  // simpler version that only inserts the private check, and uses the flat
+  // operation.
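A sketch of what that simpler expansion produces for a 64-bit flat `atomicrmw add` (illustrative; block names follow the ones created in this function, and alignment and syncscope details are elided):

```llvm
; The private path is a plain load/op/store, since private memory is
; per-thread; the global path keeps a flat atomic, tagged so it is not
; expanded again (see the metadata emitted below).
define i64 @expanded(ptr %addr, i64 %val) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private
  %new = add i64 %loaded.private, %val
  store i64 %new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %old = phi i64 [ %loaded.private, %atomicrmw.private ],
                 [ %loaded.global, %atomicrmw.global ]
  ret i64 %old
}

declare i1 @llvm.amdgcn.is.private(ptr nocapture)

!0 = !{i32 5, i32 6}
```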
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16796,9 +16848,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16811,23 +16869,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16844,23 +16905,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for
+  // private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We already inserted the runtime check; make sure we do not try to
+    // re-expand this instruction.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
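The effect of that metadata, sketched as IR (illustrative): the flat atomic kept in `atomicrmw.global` is tagged as never accessing address space 5 (private), so `flatInstrMayAccessPrivate` returns false on the next legalization round and the instruction is not expanded a second time:

```llvm
define i64 @kept_flat_atomic(ptr %addr, i64 %val) {
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !0
  ret i64 %loaded.global
}

!0 = !{i32 5, i32 6} ; [5, 6): cannot be AMDGPU private (scratch)
```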
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);