Skip to content

Commit 9638d08

Browse files
[NVPTX] Support for memory orderings for cmpxchg (#126159)
So far, all cmpxchg instructions were lowered to atom.cas. This change adds support for memory orders in lowering. Specifically: - For cmpxchg which are emulated, memory ordering is enforced by adding fences around the emulation loops. - For cmpxchg which are lowered to PTX directly, where the memory order is supported in ptx, lower directly to the correct ptx instruction. - For seq_cst cmpxchg which are lowered to PTX directly, use a sequence (fence.sc; atom.cas.acquire) to provide the semantics that we want. Also adds tests for all possible combinations of (size, memory ordering, address space, SM/PTX versions) This also adds `atomicOperationOrderAfterFenceSplit` in TargetLowering, for specially handling seq_cst atomics.
1 parent d9d1f24 commit 9638d08

File tree

14 files changed

+19153
-173
lines changed

14 files changed

+19153
-173
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2173,6 +2173,14 @@ class TargetLoweringBase {
21732173
return false;
21742174
}
21752175

2176+
// The memory ordering that AtomicExpandPass should assign to an atomic
2177+
// instruction that it has lowered by adding fences. This can be used
2178+
// to "fold" one of the fences into the atomic instruction.
2179+
virtual AtomicOrdering
2180+
atomicOperationOrderAfterFenceSplit(const Instruction *I) const {
2181+
return AtomicOrdering::Monotonic;
2182+
}
2183+
21762184
/// Whether AtomicExpandPass should automatically insert a trailing fence
21772185
/// without reducing the ordering for this atomic. Defaults to false.
21782186
virtual bool

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
324324
// failure path. As a result, fence insertion is directly done by
325325
// expandAtomicCmpXchg in that case.
326326
FenceOrdering = CASI->getMergedOrdering();
327-
CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
328-
CASI->setFailureOrdering(AtomicOrdering::Monotonic);
327+
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
328+
329+
CASI->setSuccessOrdering(CASOrdering);
330+
CASI->setFailureOrdering(CASOrdering);
329331
}
330332

331333
if (FenceOrdering != AtomicOrdering::Monotonic) {

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,15 @@
4242
#include "llvm/IR/FPEnv.h"
4343
#include "llvm/IR/Function.h"
4444
#include "llvm/IR/GlobalValue.h"
45+
#include "llvm/IR/IRBuilder.h"
4546
#include "llvm/IR/Instruction.h"
4647
#include "llvm/IR/Instructions.h"
4748
#include "llvm/IR/IntrinsicsNVPTX.h"
4849
#include "llvm/IR/Module.h"
4950
#include "llvm/IR/Type.h"
5051
#include "llvm/IR/Value.h"
5152
#include "llvm/Support/Alignment.h"
53+
#include "llvm/Support/AtomicOrdering.h"
5254
#include "llvm/Support/Casting.h"
5355
#include "llvm/Support/CodeGen.h"
5456
#include "llvm/Support/CommandLine.h"
@@ -997,6 +999,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
997999
// actions
9981000
computeRegisterProperties(STI.getRegisterInfo());
9991001

1002+
// PTX support for 16-bit CAS is emulated. Only use 32+
10001003
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
10011004
setMaxAtomicSizeInBitsSupported(64);
10021005
setMaxDivRemBitWidthSupported(64);
@@ -5600,6 +5603,70 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
56005603
return AtomicExpansionKind::CmpXChg;
56015604
}
56025605

5606+
bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
5607+
const Instruction *I) const {
5608+
auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
5609+
// When CAS bitwidth is not supported on the hardware, the CAS is emulated
5610+
// using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
5611+
// the memory order using explicit fences around the retry loop.
5612+
// The memory order of natively supported CAS operations can be enforced
5613+
// by lowering to an atom.cas with the right memory synchronizing effect.
5614+
// However, atom.cas only supports relaxed, acquire, release and acq_rel.
5615+
// So we also use explicit fences for enforcing memory order for
5616+
// seq_cast CAS with natively-supported bitwidths.
5617+
return CI &&
5618+
(cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
5619+
STI.getMinCmpXchgSizeInBits() ||
5620+
CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
5621+
}
5622+
5623+
AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
5624+
const Instruction *I) const {
5625+
auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
5626+
bool BitwidthSupportedAndIsSeqCst =
5627+
CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
5628+
cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
5629+
STI.getMinCmpXchgSizeInBits();
5630+
return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
5631+
: AtomicOrdering::Monotonic;
5632+
}
5633+
5634+
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
5635+
Instruction *Inst,
5636+
AtomicOrdering Ord) const {
5637+
if (!isa<AtomicCmpXchgInst>(Inst))
5638+
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
5639+
5640+
// Specialize for cmpxchg
5641+
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
5642+
if (isReleaseOrStronger(Ord))
5643+
return Ord == AtomicOrdering::SequentiallyConsistent
5644+
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
5645+
: Builder.CreateFence(AtomicOrdering::Release);
5646+
5647+
return nullptr;
5648+
}
5649+
5650+
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
5651+
Instruction *Inst,
5652+
AtomicOrdering Ord) const {
5653+
// Specialize for cmpxchg
5654+
if (!isa<AtomicCmpXchgInst>(Inst))
5655+
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
5656+
5657+
auto CASWidth =
5658+
cast<IntegerType>(
5659+
dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
5660+
->getBitWidth();
5661+
// Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
5662+
if (isAcquireOrStronger(Ord) &&
5663+
(Ord != AtomicOrdering::SequentiallyConsistent ||
5664+
CASWidth < STI.getMinCmpXchgSizeInBits()))
5665+
return Builder.CreateFence(AtomicOrdering::Acquire);
5666+
5667+
return nullptr;
5668+
}
5669+
56035670
// Pin NVPTXTargetObjectFile's vtables to this file.
56045671
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
56055672

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "NVPTX.h"
1818
#include "llvm/CodeGen/SelectionDAG.h"
1919
#include "llvm/CodeGen/TargetLowering.h"
20+
#include "llvm/Support/AtomicOrdering.h"
2021

2122
namespace llvm {
2223
namespace NVPTXISD {
@@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering {
260261
return true;
261262
}
262263

264+
bool shouldInsertFencesForAtomic(const Instruction *) const override;
265+
266+
AtomicOrdering
267+
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
268+
269+
Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
270+
AtomicOrdering Ord) const override;
271+
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
272+
AtomicOrdering Ord) const override;
273+
263274
private:
264275
const NVPTXSubtarget &STI; // cache the subtarget here
265276
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 81 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -2011,41 +2011,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
20112011

20122012
// has 3 operands
20132013
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
2014-
ValueType regT, NVPTXRegClass regclass,
2014+
ValueType regT, NVPTXRegClass regclass, string SemStr,
20152015
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
20162016
Operand IMMType, list<Predicate> Pred> {
20172017
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
20182018
def reg : NVPTXInst<(outs regclass:$dst),
20192019
(ins ptrclass:$addr, regclass:$b, regclass:$c),
2020-
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2020+
!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
20212021
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
20222022
Requires<Pred>;
20232023

20242024
def imm1 : NVPTXInst<(outs regclass:$dst),
20252025
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
2026-
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2026+
!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
20272027
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
20282028
Requires<Pred>;
20292029

20302030
def imm2 : NVPTXInst<(outs regclass:$dst),
20312031
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
2032-
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
2032+
!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
20332033
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
20342034
Requires<Pred>;
20352035

20362036
def imm3 : NVPTXInst<(outs regclass:$dst),
20372037
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
2038-
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2038+
!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
20392039
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
20402040
Requires<Pred>;
20412041
}
20422042
}
2043-
multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
2044-
string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
2045-
defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
2046-
IntOp, IMMType, Pred>;
2047-
defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
2048-
IntOp, IMMType, Pred>;
2043+
multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
2044+
string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
2045+
defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
2046+
OpcStr, IntOp, IMMType, Pred>;
2047+
defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
2048+
OpcStr, IntOp, IMMType, Pred>;
20492049
}
20502050

20512051
// atom_add
@@ -2427,51 +2427,76 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
24272427
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
24282428
".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
24292429

2430-
// atom_cas
2431-
2432-
def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2433-
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2434-
def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2435-
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2436-
def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2437-
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2438-
def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2439-
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2440-
def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2441-
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2442-
def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2443-
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2444-
def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2445-
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2446-
def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2447-
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2448-
def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2449-
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2450-
2451-
defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2452-
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
2453-
defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
2454-
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
2455-
defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
2456-
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
2457-
defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2458-
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
2459-
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
2460-
atomic_cmp_swap_i32_g, i32imm>;
2461-
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
2462-
atomic_cmp_swap_i32_s, i32imm>;
2463-
defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
2464-
atomic_cmp_swap_i32_gen, i32imm>;
2465-
defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
2466-
".cas", atomic_cmp_swap_i32_gen, i32imm>;
2467-
defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
2468-
atomic_cmp_swap_i64_g, i64imm>;
2469-
defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
2470-
atomic_cmp_swap_i64_s, i64imm>;
2471-
defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
2472-
atomic_cmp_swap_i64_gen, i64imm>;
2473-
defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
2474-
".cas", atomic_cmp_swap_i64_gen, i64imm>;
2430+
multiclass ternary_atomic_op_as {
2431+
// one record per address space
2432+
def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2433+
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2434+
AS_match.generic>;
2435+
2436+
def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2437+
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2438+
AS_match.global>;
2439+
2440+
def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2441+
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2442+
AS_match.shared>;
2443+
}
2444+
2445+
// generate pattern fragments for size x memory order
2446+
// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs
2447+
// using larger-bitwidth cas
2448+
foreach size = ["i16", "i32", "i64"] in {
2449+
foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
2450+
defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
2451+
}
2452+
}
2453+
2454+
// eg. with type = 32, order = ".acquire", addrspace = ".global",
2455+
// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global.
2456+
// preds = [hasSM<70>, hasPTX<63>]
2457+
// F_ATOMIC_3<i32, Int32Regs, ".acquire", ".global", ".b32",
2458+
// ".cas", atomic_cmp_swap_i32_acquire_global, i32imm,
2459+
// [hasSM<70>, hasPTX<63>]>
2460+
multiclass INT_PTX_ATOM_CAS<string atomic_cmp_swap_pat, string type,
2461+
string order, string addrspace, list<Predicate> preds>
2462+
: F_ATOMIC_3<!cast<ValueType>("i"#type),
2463+
!cast<NVPTXRegClass>("Int"#type#"Regs"),
2464+
order,
2465+
addrspace,
2466+
".b"#type,
2467+
".cas",
2468+
!cast<PatFrag>(atomic_cmp_swap_pat),
2469+
!cast<Operand>("i"#type#"imm"),
2470+
preds>;
2471+
2472+
// Define atom.cas for all combinations of size x addrspace x memory order
2473+
// supported in PTX *and* on the hardware.
2474+
foreach size = ["32", "64"] in {
2475+
foreach addrspace = ["generic", "global", "shared"] in {
2476+
defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace);
2477+
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
2478+
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
2479+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
2480+
// Memory orders are only supported for SM70+ / PTX63+, so we have two sets of instruction definitions:
2481+
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
2482+
defm INT_PTX_ATOM_CAS_#size#_#order#addrspace
2483+
: INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
2484+
cas_order_string, cas_addrspace_string,
2485+
[hasSM<70>, hasPTX<63>]>;
2486+
defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace
2487+
: INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
2488+
"", cas_addrspace_string, []>;
2489+
}
2490+
}
2491+
}
2492+
2493+
// Note that 16-bit CAS support in PTX is emulated.
2494+
defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
2495+
atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>;
2496+
defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
2497+
atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>;
2498+
defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
2499+
atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>;
24752500

24762501
// Support for scoped atomic operations. Matches
24772502
// int_nvvm_atomic_{op}_{space}_{type}_{scope}

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
147147
// set of equivalent memory operations with a scalar data-type, executed in
148148
// an unspecified order on the elements in the vector.
149149
unsigned getMaxRequiredAlignment() const { return 8; }
150-
// Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
150+
// Get the smallest cmpxchg word size that the hardware supports.
151151
unsigned getMinCmpXchgSizeInBits() const { return 32; }
152152

153153
unsigned getPTXVersion() const { return PTXVersion; }

llvm/test/CodeGen/NVPTX/atomics-sm90.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
7171
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
7272
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
7373
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
74-
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
74+
; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
7575
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
7676
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
7777
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
8787
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
8888
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
8989
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
90-
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
90+
; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
9191
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
9292
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
9393
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
109109
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
110110
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
111111
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
112-
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
112+
; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
113113
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
114114
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
115115
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
131131
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
132132
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
133133
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
134-
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
134+
; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
135135
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
136136
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
137137
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;

0 commit comments

Comments
 (0)