@@ -16607,9 +16607,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
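For context, each AtomicExpansionKind result selects a different strategy in AtomicExpandPass: None keeps the cmpxchg for normal instruction selection, NotAtomic lowers it to plain loads and stores (sound for private/scratch memory, which is per-thread), and Expand hands the instruction back to the target hook added later in this patch. A simplified sketch of that dispatch, not the verbatim pass code:

switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
case TargetLowering::AtomicExpansionKind::None:
  break; // Keep the cmpxchg; the backend selects a hardware atomic.
case TargetLowering::AtomicExpansionKind::NotAtomic:
  lowerAtomicCmpXchgInst(CI); // Plain load/compare/store, no atomicity.
  break;
case TargetLowering::AtomicExpansionKind::Expand:
  TLI->emitExpandAtomicCmpXchg(CI); // Target-specific IR expansion (below).
  break;
default:
  llvm_unreachable("other kinds are not used for cmpxchg in this sketch");
}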
@@ -16773,40 +16785,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
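The rest of that comment block is unchanged context the diff omits. For orientation, a hedged reconstruction of the control flow the expansion produces, inferred from the basic-block names and builder calls visible later in this patch (labels and exact ordering are approximate, not the verbatim comment):

// %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) ; FullFlatEmulation only
// br i1 %is.shared, label %atomicrmw.shared, label %check.private
//
// atomicrmw.shared:              ; addrspacecast to LDS, clone of the atomic
//   br label %atomicrmw.phi
//
// check.private:
//   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
//   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
//
// atomicrmw.private:             ; addrspacecast to scratch, then a plain
//   ...                          ; load/op/store sequence ("loaded.private")
//   br label %atomicrmw.phi
//
// atomicrmw.global:              ; the original atomic, cast to the global
//   ...                          ; address space under FullFlatEmulation
//   br label %atomicrmw.phi
//
// atomicrmw.phi:                 ; phi of the loaded values, then exit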
@@ -16853,6 +16833,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
@@ -16874,11 +16882,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16893,8 +16896,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
     Instruction *Clone = AI->clone();
     Clone->insertInto(SharedBB, SharedBB->end());
-    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToLocal);
+    Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
     LoadedShared = Clone;
 
     Builder.CreateBr(PhiBB);
@@ -16906,14 +16908,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
-
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
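buildAtomicCmpXchgValue is a shared helper whose body is not part of this diff (it lives alongside buildAtomicRMWValue in llvm/Transforms/Utils/LowerAtomic.h). A minimal sketch of the sequence it is expected to emit on the private path, assuming the {loaded value, success flag} pair convention consumed by the insertvalues above; illustrative, not the library implementation:

#include "llvm/IR/IRBuilder.h"
#include <utility>
using namespace llvm;

// Non-atomic compare-and-swap: sound here because private (scratch) memory
// is only visible to the current thread.
static std::pair<Value *, Value *>
sketchCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val,
                   Align Alignment) {
  Value *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment,
                                          "loaded");         // old value
  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp, "success"); // did it match?
  Value *Store = Builder.CreateSelect(Equal, Val, Orig);     // new or unchanged
  Builder.CreateAlignedStore(Store, Ptr, Alignment);
  return {Orig, Equal};
}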
@@ -16923,8 +16940,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16948,7 +16964,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16960,6 +16976,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
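A worked example of the identity rewrite in the new emitExpandAtomicRMW: with a zero operand, sub, or, and xor all store the old value back unchanged and return it, so they can be canonicalized to add 0, the form the idempotent-RMW lowering below recognizes. Sketched as comments in the style of the surrounding source; the IR is illustrative, not taken from the patch:

// All equivalent when the operand is zero (store old value, return old value):
//   %v = atomicrmw or  ptr %p, i32 0 seq_cst   ; old | 0 == old
//   %v = atomicrmw xor ptr %p, i32 0 seq_cst   ; old ^ 0 == old
//   %v = atomicrmw sub ptr %p, i32 0 seq_cst   ; old - 0 == old
// Canonical form after the rewrite:
//   %v = atomicrmw add ptr %p, i32 0 seq_cst   ; old + 0 == old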