Add out-of-line-atomics support to GlobalISel #74588


Merged: 11 commits into llvm:main on Jan 4, 2024

Conversation

@RoboTux (Contributor) commented Dec 6, 2023

This patch implements the GlobalISel counterpart to
4d7df43.
@llvmbot (Member) commented Dec 6, 2023

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: Thomas Preud'homme (RoboTux)

Changes

This patch implements the GlobalISel counterpart to 4d7df43.
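
For context, out-of-line (outline) atomics lower atomic operations to calls into helper routines provided by compiler-rt/libgcc, so the helper can select the best instruction sequence for the actual CPU at run time. A minimal sketch of the effect at the source level; illustrative only, with the helper name taken from the naming scheme visible in the updated tests:

  // Sketch: built for AArch64 with outline atomics enabled and without LSE,
  // this fetch_add is expected to lower to "bl __aarch64_ldadd4_relax"
  // instead of an inline ldxr/stxr loop.
  #include <atomic>

  int fetch_add_relaxed(std::atomic<int> &Counter, int V) {
    return Counter.fetch_add(V, std::memory_order_relaxed);
  }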


Patch is 363.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74588.diff

9 Files Affected:

  • (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+183)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+26-3)
  • (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll (+8-40)
  • (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll (+16-32)
  • (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll (+955-1425)
  • (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll (+363-1320)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll (+239)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll (+1367)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir (-2)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 045fc78218dae..186937e597c5b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -765,6 +765,166 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
   return LegalizerHelper::Legalized;
 }
 
+static RTLIB::Libcall
+getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
+  unsigned ModeN, ModelN;
+  switch (MemSize) {
+  case 1:
+    ModeN = 0;
+    break;
+  case 2:
+    ModeN = 1;
+    break;
+  case 4:
+    ModeN = 2;
+    break;
+  case 8:
+    ModeN = 3;
+    break;
+  case 16:
+    ModeN = 4;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+  switch (Order) {
+  case AtomicOrdering::Monotonic:
+    ModelN = 0;
+    break;
+  case AtomicOrdering::Acquire:
+    ModelN = 1;
+    break;
+  case AtomicOrdering::Release:
+    ModelN = 2;
+    break;
+  case AtomicOrdering::AcquireRelease:
+  case AtomicOrdering::SequentiallyConsistent:
+    ModelN = 3;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+#define LCALLS(A, B)                                                           \
+  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALL5(A)                                                              \
+  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_ADD: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_AND: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_OR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
+    return LC[ModeN][ModelN];
+  }
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+#undef LCALLS
+#undef LCALL5
+}
+
+static LegalizerHelper::LegalizeResult
+createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
+  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+  // Add all the args, except for the last which is an imm denoting 'tail'.
+  // const CallLowering::ArgInfo &Result,
+  // Operand 0 & 1 are return: 0 is old val, 1 is success, 2-4 are reg operands:
+  // 2 is ptr, 3 is expected, 4 is new
+  Type *RetTy;
+  SmallVector<Register> RetRegs;
+  SmallVector<CallLowering::ArgInfo, 3> Args;
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    Register Success;
+    LLT SuccessLLT;
+    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
+        MI.getFirst4RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
+               NewLLT) = MI.getFirst5RegLLTs();
+      RetRegs.push_back(Success);
+      RetTy = StructType::get(
+          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
+    }
+    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
+    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMICRMW_AND) {
+      Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+      MIRBuilder.buildXor(Tmp, MIRBuilder.buildConstant(ValLLT, -1), Val);
+      Val = Tmp;
+    }
+    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  default:
+    llvm_unreachable("unsupported opcode");
+  }
+
+  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+  auto &AtomicMI = cast<GMemOperation>(MI);
+  auto Ordering = AtomicMI.getMMO().getMergedOrdering();
+  uint64_t MemSize = AtomicMI.getMemSize();
+  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(Opc, Ordering, MemSize);
+  const char *Name = TLI.getLibcallName(RTLibcall);
+
+  // Unsupported libcall on the target.
+  if (!Name) {
+    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
+                      << MIRBuilder.getTII().getName(Opc) << "\n");
+    return LegalizerHelper::UnableToLegalize;
+  }
+
+  CallLowering::CallLoweringInfo Info;
+  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
+  Info.Callee = MachineOperand::CreateES(Name);
+  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);
+
+  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+  if (!CLI.lowerCall(MIRBuilder, Info))
+    return LegalizerHelper::UnableToLegalize;
+
+  return LegalizerHelper::Legalized;
+}
+
 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                        Type *FromType) {
   auto ToMVT = MVT::getVT(ToType);
@@ -1020,6 +1180,18 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR:
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    auto Status = createAtomicLibcall(MIRBuilder, MI);
+    if (Status != Legalized)
+      return Status;
+    break;
+  }
   case TargetOpcode::G_BZERO:
   case TargetOpcode::G_MEMCPY:
   case TargetOpcode::G_MEMMOVE:
@@ -3793,6 +3965,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerTRUNC(MI);
   GISEL_VECREDUCE_CASES_NONSEQ
     return lowerVectorReduction(MI);
+  case G_ATOMICRMW_SUB: {
+    auto Val = MI.getOperand(2).getReg();
+    LLT ValLLT = MRI.getType(Val);
+    Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+    MIRBuilder.buildSub(Tmp, MIRBuilder.buildConstant(ValLLT, 0), Val);
+    auto [Ret, Mem] = MI.getFirst2Regs();
+    auto &MMO = cast<GMemOperation>(MI).getMMO();
+    MIRBuilder.buildAtomicRMWAdd(Ret, Mem, Tmp, MMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 21a412e9360dc..7fce3e501db57 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -758,16 +758,39 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
 
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
       .customIf([](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() == 128;
       })
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
+  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
+                               G_ATOMICRMW_AND, G_ATOMICRMW_OR,
+                               G_ATOMICRMW_XOR})
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
+      .lowerIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
+  // Don't outline them unless
+  // (1) high level <atomic> support approved:
+  //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+  // (2) low level libgcc and compiler-rt support implemented by:
+  //   min/max outline atomics helpers
   getActionDefinitionsBuilder(
-      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
-       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
-       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
+      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
index fb4bef33d9b4f..fccafb29addbc 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
@@ -229,11 +229,7 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered:
 ; -O1:    ldxp x0, x1, [x8]
@@ -244,11 +240,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -259,11 +251,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
@@ -274,11 +262,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -289,11 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -304,11 +284,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -319,11 +295,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr)
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -334,11 +306,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const:
 ; -O1:    ldaxp x0, x1, [x8]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
index 3d204b734d4a0..e594561010464 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
@@ -117,14 +117,10 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
@@ -136,14 +132,10 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
@@ -155,14 +147,10 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
@@ -174,14 +162,10 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
index c660c139e35d4..e9b096e8c6c44 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
@@ -145,14 +145,10 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
@@ -164,14 +160,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
@@ -183,14 +175,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
@@ -202,14 +190,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
@@ -221,14 +205,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8...
[truncated]
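
To summarize the mechanics of the truncated diff: getOutlineAtomicLibcall maps an (opcode, memory size, ordering) triple to one of the RTLIB outline-atomic libcalls through the LC[5][4] tables, where sizes 1/2/4/8/16 bytes select ModeN 0-4 and orderings relaxed/acquire/release/(acq_rel or seq_cst) select ModelN 0-3. A simplified sketch of the naming convention those tables encode; the real code returns RTLIB::Libcall enumerators rather than building strings:

  #include <cstdint>
  #include <string>

  // Sketch only: reconstructs the helper-name scheme behind the LC[5][4]
  // tables, e.g. outlineHelperName("cas", 16, 1) == "__aarch64_cas16_acq",
  // matching the calls that appear in the updated tests.
  std::string outlineHelperName(const std::string &Op, // "cas", "swp", "ldadd", ...
                                std::uint64_t MemSize, // 1, 2, 4, 8 or 16
                                unsigned ModelN) {     // 0..3 as in the patch
    static const char *Suffix[4] = {"_relax", "_acq", "_rel", "_acq_rel"};
    return "__aarch64_" + Op + std::to_string(MemSize) + Suffix[ModelN];
  }

Two details worth calling out from the diff itself: G_ATOMICRMW_AND is implemented with the LDCLR (bit-clear) helper, so createAtomicLibcall first XORs the operand with -1; and G_ATOMICRMW_SUB has no helper of its own, so the new lower() case rewrites it as an atomic ADD of the negated value.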

@ilinpv (Contributor) left a comment:

Overall the outline atomics part looks good; for GlobalISel, @davemgreen can probably provide some input. Thank you for bringing outline atomics there!

Inline review comment (Contributor) on getOutlineAtomicLibcall, at the G_ATOMICRMW_ADD case:

      return LC[ModeN][ModelN];
    }
    case TargetOpcode::G_ATOMICRMW_ADD: {
      const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};

Tests for ldadd4, ldadd8 outline atomics would be great to add.
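
A hypothetical snippet that would exercise those two helpers, assuming outline atomics enabled without LSE; the names follow the size-suffix scheme used throughout the patch:

  #include <atomic>
  #include <cstdint>

  // Hypothetical coverage for the request above: the 32-bit fetch_add is
  // expected to call __aarch64_ldadd4_relax, the 64-bit one
  // __aarch64_ldadd8_relax.
  std::int32_t add32(std::atomic<std::int32_t> &A, std::int32_t V) {
    return A.fetch_add(V, std::memory_order_relaxed);
  }
  std::int64_t add64(std::atomic<std::int64_t> &A, std::int64_t V) {
    return A.fetch_add(V, std::memory_order_relaxed);
  }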


github-actions bot commented Dec 15, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@aemerson (Contributor) left a comment:

Not an expert on atomics, but why would we have a libcall at -O0 but not at -O1 in the tests?

@arsenm (Contributor) left a comment:

Are pointer type xchg / cmpxchg already tested?

@RoboTux (Contributor, Author) commented Dec 18, 2023

> Not an expert on atomics, but why would we have a libcall at -O0 but not at -O1 in the tests?

I looked at it for the u?(min|max) cases and it seemed to boil down to the atomic expand pass being run at -O1 and above.

@RoboTux (Contributor, Author) commented Dec 18, 2023

> > Not an expert on atomics, but why would we have a libcall at -O0 but not at -O1 in the tests?
>
> I looked at it for the u?(min|max) cases and it seemed to boil down to the atomic expand pass being run at -O1 and above.

No, sorry: it's not that the pass is only run at -O1 and above, it's that its output is different. At -O0 it keeps the cmpxchg, whereas at -O1 it changes the cmpxchg into ldxr + stlxr intrinsics.

@RoboTux (Contributor, Author) commented Dec 18, 2023

@aemerson: AArch64TargetLowering::shouldExpandAtomicRMWInIR() has:
  // Nand is not supported in LSE.
  // Leave 128 bits to LLSC or CmpXChg.
  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
    if (Subtarget->hasLSE())
      return AtomicExpansionKind::None;
    if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
      // Don't outline them unless
      // (1) high level <atomic> support approved:
      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
      // (2) low level libgcc and compiler-rt support implemented by:
      //   min/max outline atomics helpers
      if (AI->getOperation() != AtomicRMWInst::Min &&
          AI->getOperation() != AtomicRMWInst::Max &&
          AI->getOperation() != AtomicRMWInst::UMin &&
          AI->getOperation() != AtomicRMWInst::UMax) {
        return AtomicExpansionKind::None;
      }
    }
  }

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
  // we have a single CAS instruction that can replace the loop.
  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
      Subtarget->hasLSE())
    return AtomicExpansionKind::CmpXChg;

That explains why -O0 differs from -O1 for nand and u?(min|max).
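
Concretely, the i128 test updates above show both paths for the same source construct. A sketch, with the expected lowering taken from the test changes in this patch:

  #include <atomic>

  // At -O0 the cmpxchg expansion of this load survives to the legalizer and
  // becomes an outline call ("bl __aarch64_cas16_relax" in the tests above);
  // at -O1 AtomicExpand emits LL/SC intrinsics instead, yielding an inline
  // ldxp/stxp loop. (__int128 is a GNU extension available on AArch64.)
  __int128 load_relaxed(std::atomic<__int128> &A) {
    return A.load(std::memory_order_relaxed);
  }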

@arsenm (Contributor) commented Dec 18, 2023

> // At -O0, fast-regalloc cannot cope with the live vregs necessary to

This sounds extremely unsound.

@jyknight (Member) commented:

> This sounds extremely unsound

It is completely unsound. The constraints that approximately every architecture places on LL/SC loops make it unsound to ever generate isolated LL and SC instructions at the IR level, as we currently do on AArch64, ARM, and Hexagon.

We already added the infrastructure to do this properly and used it for RISC-V; it's now also being used on LoongArch. ARM/AArch64 should be updated to match.

See discussion in https://lists.llvm.org/pipermail/llvm-dev/2018-June/123993.html

@RoboTux (Contributor, Author) commented Dec 21, 2023

> > // At -O0, fast-regalloc cannot cope with the live vregs necessary to
>
> This sounds extremely unsound

Just to clarify, is the expectation that I solve this in this patch, or can it be done in a separate patch? Note that this is existing code, exercised when compiling at -O1 or above (i.e. when not using GlobalISel) for some of the atomics.

Best regards,
Thomas

@arsenm (Contributor) commented Dec 21, 2023

> Just to clarify, is the expectation that I solve this in this patch, or can it be done in a separate patch?

This should be separate; it's an unrelated issue.

@RoboTux (Contributor, Author) commented Jan 2, 2024

My apologies @arsenm, I had missed your review comments. All fixed now.

@RoboTux merged commit ce61b0e into llvm:main on Jan 4, 2024
@RoboTux (Contributor, Author) commented Jan 4, 2024

Thanks for fixing the unused variable, @DamonFool; I was about to revert and push a new patch.

@RoboTux deleted the global-isel-outline-atomics branch on April 18, 2024