@@ -16595,9 +16595,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16595
16595
16596
16596
TargetLowering::AtomicExpansionKind
16597
16597
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { // Picks the expansion kind for a cmpxchg from its pointer address space.
16598
- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16599
- ? AtomicExpansionKind::NotAtomic
16600
- : AtomicExpansionKind::None;
16598
+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
16599
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16600
+ return AtomicExpansionKind::NotAtomic; // Private pointer: request non-atomic lowering.
16601
+
16602
+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16603
+ return AtomicExpansionKind::None; // Not flat, or flatInstrMayAccessPrivate says no: no expansion.
16604
+
16605
+ const DataLayout &DL = CmpX->getDataLayout();
16606
+
16607
+ Type *ValTy = CmpX->getNewValOperand()->getType();
16608
+
16609
+ // If a 64-bit flat atomic may alias private, we need to avoid using the
16610
+ // atomic in the private case, so request expansion; other sizes are left as-is.
16611
+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16612
+ : AtomicExpansionKind::None;
16601
16613
}
16602
16614
16603
16615
const TargetRegisterClass *
@@ -16761,40 +16773,8 @@ bool SITargetLowering::checkForPhysRegDependency(
16761
16773
return false;
16762
16774
}
16763
16775
16764
- void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16765
- AtomicRMWInst::BinOp Op = AI->getOperation();
16766
-
16767
- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16768
- Op == AtomicRMWInst::Xor) {
16769
- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16770
- ConstVal && ConstVal->isNullValue()) {
16771
- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16772
- AI->setOperation(AtomicRMWInst::Add);
16773
-
16774
- // TODO: Turn the below private handling into a no-op for idempotent
16775
- // cases.
16776
- }
16777
- }
16778
-
16779
- // The non-flat expansions should only perform the de-canonicalization of
16780
- // identity values.
16781
- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16782
- return;
16783
-
16784
- // FullFlatEmulation is true if we need to issue the private, shared, and
16785
- // global cases.
16786
- //
16787
- // If this is false, we are only dealing with the flat-targeting-private case,
16788
- // where we only insert a check for private and still use the flat instruction
16789
- // for global and shared.
16790
-
16791
- // TODO: Avoid the private check for the fadd case depending on
16792
- // noalias.addrspace.
16793
-
16794
- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16795
- Subtarget->hasAtomicFaddInsts() &&
16796
- AI->getType()->isFloatTy();
16797
-
16776
+ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16777
+ Instruction *AI) const {
16798
16778
// Given: atomicrmw fadd ptr %addr, float %val ordering
16799
16779
//
16800
16780
// With this expansion we produce the following code:
@@ -16841,6 +16821,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16841
16821
IRBuilder<> Builder(AI);
16842
16822
LLVMContext &Ctx = Builder.getContext();
16843
16823
16824
+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16825
+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16826
+ : AtomicCmpXchgInst::getPointerOperandIndex();
16827
+ Value *Addr = AI->getOperand(PtrOpIdx);
16828
+
16829
+ // TODO: Only the private case needs a check; the other path could then emit
16830
+ // a flat access known not to be private (no shared block or cast to global).
16831
+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16832
+
16833
+ Align Alignment;
16834
+ if (RMW)
16835
+ Alignment = RMW->getAlign();
16836
+ else if (CX)
16837
+ Alignment = CX->getAlign();
16838
+ else
16839
+ llvm_unreachable("unhandled atomic operation");
16840
+
16841
+ // FullFlatEmulation is true if we need to issue the private, shared, and
16842
+ // global cases.
16843
+ //
16844
+ // If this is false, we are only dealing with the flat-targeting-private case,
16845
+ // where we only insert a check for private and still use the flat instruction
16846
+ // for global and shared.
16847
+
16848
+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16849
+ Subtarget->hasAtomicFaddInsts() &&
16850
+ RMW->getType()->isFloatTy();
16851
+
16844
16852
// If the return value isn't used, do not introduce a false use in the phi.
16845
16853
bool ReturnValueIsUsed = !AI->use_empty();
16846
16854
@@ -16862,11 +16870,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16862
16870
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16863
16871
BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16864
16872
16865
- Value *Val = AI->getValOperand();
16866
- Type *ValTy = Val->getType();
16867
- Value *Addr = AI->getPointerOperand();
16868
- Align Alignment = AI->getAlign();
16869
-
16870
16873
std::prev(BB->end())->eraseFromParent();
16871
16874
Builder.SetInsertPoint(BB);
16872
16875
@@ -16881,8 +16884,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16881
16884
16882
16885
Instruction *Clone = AI->clone();
16883
16886
Clone->insertInto(SharedBB, SharedBB->end());
16884
- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16885
- .set(CastToLocal);
16887
+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
16886
16888
LoadedShared = Clone;
16887
16889
16888
16890
Builder.CreateBr(PhiBB);
@@ -16894,14 +16896,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16894
16896
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16895
16897
16896
16898
Builder.SetInsertPoint(PrivateBB);
16899
+
16897
16900
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16898
16901
Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16899
- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16900
- Alignment, "loaded.private");
16901
16902
16902
- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16903
+ Value *LoadedPrivate;
16904
+ if (RMW) {
16905
+ LoadedPrivate = Builder.CreateAlignedLoad(
16906
+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16907
+
16908
+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16909
+ LoadedPrivate, RMW->getValOperand());
16910
+
16911
+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16912
+ } else {
16913
+ auto [ResultLoad, Equal] =
16914
+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16915
+ CX->getNewValOperand(), CX->getAlign());
16916
+
16917
+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16918
+ ResultLoad, 0);
16919
+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16920
+ }
16903
16921
16904
- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
16905
16922
Builder.CreateBr(PhiBB);
16906
16923
16907
16924
Builder.SetInsertPoint(GlobalBB);
@@ -16911,8 +16928,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16911
16928
if (FullFlatEmulation) {
16912
16929
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16913
16930
Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16914
- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16915
- .set(CastToGlobal);
16931
+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
16916
16932
}
16917
16933
16918
16934
AI->removeFromParent();
@@ -16936,7 +16952,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16936
16952
Builder.SetInsertPoint(PhiBB);
16937
16953
16938
16954
if (ReturnValueIsUsed) {
16939
- PHINode *Loaded = Builder.CreatePHI(ValTy , 3);
16955
+ PHINode *Loaded = Builder.CreatePHI(AI->getType() , 3);
16940
16956
AI->replaceAllUsesWith(Loaded);
16941
16957
if (FullFlatEmulation)
16942
16958
Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16948,6 +16964,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16948
16964
Builder.CreateBr(ExitBB);
16949
16965
}
16950
16966
16967
+ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { // Rewrites identity sub/or/xor to add, then expands flat-pointer RMWs.
16968
+ AtomicRMWInst::BinOp Op = AI->getOperation();
16969
+
16970
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16971
+ Op == AtomicRMWInst::Xor) {
16972
+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16973
+ ConstVal && ConstVal->isNullValue()) {
16974
+ // atomicrmw sub/or/xor %ptr, 0 -> atomicrmw add %ptr, 0
16975
+ AI->setOperation(AtomicRMWInst::Add);
16976
+
16977
+ // The flat-may-alias-private handling below may still be needed.
16978
+
16979
+ // TODO: Skip this for cases where we cannot access remote memory.
16980
+ }
16981
+ }
16982
+
16983
+ // The non-flat expansions should only perform the de-canonicalization of
16984
+ // identity values (done above), so there is nothing more to emit for them.
16985
+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16986
+ return;
16987
+
16988
+ emitExpandAtomicAddrSpacePredicate(AI); // Flat pointer: hand off to the address-space predicated expansion.
16989
+ }
16990
+
16991
+ void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { // Expansion entry point for cmpxchg.
16992
+ emitExpandAtomicAddrSpacePredicate(CI); // Same predicated expansion helper that emitExpandAtomicRMW calls.
16993
+ }
16994
+
16951
16995
LoadInst *
16952
16996
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16953
16997
IRBuilder<> Builder(AI);
0 commit comments