llvm · arsenm · Nov 4, 2024 · Aug 14, 2024 · Sep 30, 2024 · Nov 1, 2024
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
         "Generic atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform a cmpxchg expansion using a target-specific method.
+  virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+    llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
+  }
+
   /// Perform a bit test atomicrmw using a target-specific intrinsic. This
   /// represents the combined bit test intrinsic which will be lowered at a late
   /// stage by the backend.

diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
@@ -23,6 +23,12 @@ class IRBuilderBase;
 /// Convert the given Cmpxchg into primitive load and compare.
 bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
 
+/// Emit IR to implement the given cmpxchg operation on values in registers,
+/// returning the new value.
+std::pair<Value *, Value *> buildCmpXchgValue(IRBuilderBase &Builder,
+                                              Value *Ptr, Value *Cmp,
+                                              Value *Val, Align Alignment);
+
 /// Convert the given RMWI into primitive load and stores,
 /// assuming that doing so is legal. Return true if the lowering
 /// succeeds.

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1674,6 +1674,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
     return true;
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicCmpXchgInst(CI);
+  case TargetLoweringBase::AtomicExpansionKind::Expand: {
+    TLI->emitExpandAtomicCmpXchg(CI);
+    return true;
+  }
   }
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16322,15 +16322,14 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
        ++I) {
     auto *Low = mdconst::extract<ConstantInt>(
         NoaliasAddrSpaceMD->getOperand(2 * I + 0));
-    auto *High = mdconst::extract<ConstantInt>(
-        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
-
-    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
-        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
-      return true;
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
+      auto *High = mdconst::extract<ConstantInt>(
+          NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+      return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
+    }
   }
 
-  return false;
+  return true;
 }
 
 TargetLowering::AtomicExpansionKind
@@ -16577,9 +16576,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
@@ -16745,40 +16756,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
@@ -16825,6 +16804,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
@@ -16846,11 +16853,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16865,8 +16867,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
     Instruction *Clone = AI->clone();
     Clone->insertInto(SharedBB, SharedBB->end());
-    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToLocal);
+    Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
     LoadedShared = Clone;
 
     Builder.CreateBr(PhiBB);
@@ -16878,14 +16879,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                          CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
@@ -16895,8 +16911,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16920,7 +16935,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16932,6 +16947,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+  void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+  void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
@@ -25,20 +25,30 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
   Value *Cmp = CXI->getCompareOperand();
   Value *Val = CXI->getNewValOperand();
 
-  LoadInst *Orig =
-      Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign());
-  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
-  Value *Res = Builder.CreateSelect(Equal, Val, Orig);
-  Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign());
+  auto [Orig, Equal] =
+      buildCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign());
 
-  Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
+  Value *Res =
+      Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
   Res = Builder.CreateInsertValue(Res, Equal, 1);
 
   CXI->replaceAllUsesWith(Res);
   CXI->eraseFromParent();
   return true;
 }
 
+std::pair<Value *, Value *> llvm::buildCmpXchgValue(IRBuilderBase &Builder,
+                                                    Value *Ptr, Value *Cmp,
+                                                    Value *Val,
+                                                    Align Alignment) {
+  LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
+  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+  Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+  Builder.CreateAlignedStore(Res, Ptr, Alignment);
+
+  return {Orig, Equal};
+}
+
 Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op,
                                  IRBuilderBase &Builder, Value *Loaded,
                                  Value *Val) {