[NVPTX] Support for memory orderings for cmpxchg #126159

Merged: 10 commits, Feb 24, 2025.
8 changes: 8 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2173,6 +2173,14 @@ class TargetLoweringBase {
return false;
}

// The memory ordering that AtomicExpandPass should assign to an atomic
// instruction that it has lowered by adding fences. This can be used
// to "fold" one of the fences into the atomic instruction.
virtual AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const {
return AtomicOrdering::Monotonic;
}

/// Whether AtomicExpandPass should automatically insert a trailing fence
/// without reducing the ordering for this atomic. Defaults to false.
virtual bool
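
A sketch of what this hook enables (illustrative LLVM IR, not taken from the patch): if a target returns Acquire here for a seq_cst cmpxchg, AtomicExpandPass keeps an acquire ordering on the instruction after splitting off the fences, so only a leading fence is needed:

    ; before AtomicExpandPass
    %old = cmpxchg ptr %p, i32 %cmp, i32 %new seq_cst seq_cst

    ; after, with atomicOperationOrderAfterFenceSplit returning Acquire
    fence seq_cst
    %old = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire
    ; no trailing fence: the acquire on the cmpxchg itself provides it
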
6 changes: 4 additions & 2 deletions llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// failure path. As a result, fence insertion is directly done by
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
-CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
-CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);

Review thread on the line above:

Collaborator: Should the name of this reflect that it is only called for CAS?

akshayrdeodhar (author), Feb 11, 2025: The same API could be used for RMW as well: pass the instruction, have an if-else ladder inside, and specialize based on the instruction. This is how emitLeadingFence and emitTrailingFence are implemented.

akshayrdeodhar (author): (We're planning to use it for RMW eventually.)

akshayrdeodhar (author): @topperc does the API work?

akshayrdeodhar (author): I'm thinking of merging this Monday next week; let me know if there are any other concerns.

+CASI->setSuccessOrdering(CASOrdering);
+CASI->setFailureOrdering(CASOrdering);
}

if (FenceOrdering != AtomicOrdering::Monotonic) {
67 changes: 67 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -42,13 +42,15 @@
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());

// PTX support for 16-bit CAS is emulated. Only use 32-bit and wider.
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
@@ -5565,6 +5568,70 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
return AtomicExpansionKind::CmpXChg;
}

bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
const Instruction *I) const {
auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
// When CAS bitwidth is not supported on the hardware, the CAS is emulated
// using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
// the memory order using explicit fences around the retry loop.
// The memory order of natively supported CAS operations can be enforced
// by lowering to an atom.cas with the right memory synchronizing effect.
// However, atom.cas only supports relaxed, acquire, release and acq_rel.
// So we also use explicit fences for enforcing memory order for
// seq_cst CAS with natively-supported bitwidths.
return CI &&
(cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
STI.getMinCmpXchgSizeInBits() ||
CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
}
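
For example (a sketch; on NVPTX, getMinCmpXchgSizeInBits() is currently 32):

    ; fenced: i16 is narrower than the native CAS width, so it is emulated
    %a = cmpxchg ptr %p, i16 %c16, i16 %v16 acquire acquire
    ; fenced: native width, but seq_cst cannot be expressed on atom.cas
    %b = cmpxchg ptr %q, i32 %c32, i32 %v32 seq_cst seq_cst
    ; not fenced: native width, acquire maps directly onto atom.acquire.cas.b32
    %d = cmpxchg ptr %q, i32 %c32, i32 %v32 acquire acquire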

AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
const Instruction *I) const {
auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
bool BitwidthSupportedAndIsSeqCst =
CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
STI.getMinCmpXchgSizeInBits();
return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
: AtomicOrdering::Monotonic;
}
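
Combined with the fence callbacks below, a native-width seq_cst cmpxchg should lower roughly like this (a sketch; the exact fence spelling and the register names are assumptions):

    // %old = cmpxchg ptr %p, i32 %cmp, i32 %new seq_cst seq_cst
    fence.sc.sys;                                  // leading fence
    atom.acquire.cas.b32 %old, [%p], %cmp, %new;   // acquire folded into atom.cas
    // no trailing fence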

Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);

// Specialize for cmpxchg
// Emit a fence.sc leading fence for a seq_cst cmpxchg, and a fence.release
// for release/acq_rel; weaker orderings need no leading fence.
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
: Builder.CreateFence(AtomicOrdering::Release);

return nullptr;
}

Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);

auto CASWidth =
cast<IntegerType>(
dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
->getBitWidth();
// Do not emit a trailing fence for a seq_cst cmpxchg that is not emulated;
// the acquire ordering folded into the atom.cas itself provides it.
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
return Builder.CreateFence(AtomicOrdering::Acquire);

return nullptr;
}
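
By contrast, an emulated (sub-32-bit) seq_cst cmpxchg keeps a fence on both sides of the retry loop. A simplified sketch, modeled on the test updates below (fence spellings and the loop skeleton are assumptions):

    fence.sc.sys;                      // leading fence for seq_cst
    $L__retry:
    // ...shift/mask the 16-bit value into its containing 32-bit word...
    atom.relaxed.cas.b32 %r, [%addr], %assumed, %desired;
    setp.ne.s32 %p1, %r, %assumed;
    mov.u32 %assumed, %r;
    @%p1 bra $L__retry;
    fence.acq_rel.sys;                 // trailing acquire fence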

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

11 changes: 11 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -17,6 +17,7 @@
#include "NVPTX.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/AtomicOrdering.h"

namespace llvm {
namespace NVPTXISD {
@@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering {
return true;
}

bool shouldInsertFencesForAtomic(const Instruction *) const override;

AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;

Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;

private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
137 changes: 81 additions & 56 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt

// has 3 operands
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
-ValueType regT, NVPTXRegClass regclass,
+ValueType regT, NVPTXRegClass regclass, string SemStr,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, list<Predicate> Pred> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
-!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
Requires<Pred>;

def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
-!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
Requires<Pred>;

def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
-!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
Requires<Pred>;

def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
-!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+!strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
Requires<Pred>;
}
}
-multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
-  string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
-  defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
-    IntOp, IMMType, Pred>;
-  defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
-    IntOp, IMMType, Pred>;
+multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
+  string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
+  defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+    OpcStr, IntOp, IMMType, Pred>;
+  defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+    OpcStr, IntOp, IMMType, Pred>;
}

// atom_add
@@ -2378,51 +2378,76 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;

-// atom_cas
-
-def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-
-defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
-  atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
-  atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
-  atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
-  atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
-  atomic_cmp_swap_i32_g, i32imm>;
-defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
-  atomic_cmp_swap_i32_s, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
-  atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
-  ".cas", atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
-  atomic_cmp_swap_i64_g, i64imm>;
-defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
-  atomic_cmp_swap_i64_s, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
-  atomic_cmp_swap_i64_gen, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
-  ".cas", atomic_cmp_swap_i64_gen, i64imm>;
+multiclass ternary_atomic_op_as {
+  // one record per address space
+  def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+    (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+    AS_match.generic>;
+
+  def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+    (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+    AS_match.global>;
+
+  def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+    (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+    AS_match.shared>;
+}
+
+// Generate pattern fragments for size x memory order.
+// NOTE: i8 cmpxchg is not supported in PTX; AtomicExpandPass will emulate all
+// i8 cmpxchgs using larger-bitwidth CAS.
+foreach size = ["i16", "i32", "i64"] in {
+  foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
+    defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
+  }
+}
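
For instance, the nested loops above should produce pattern fragments with names like these (a sketch of the expansion):

    atomic_cmp_swap_i32_acquire_generic
    atomic_cmp_swap_i32_acquire_global
    atomic_cmp_swap_i32_acquire_shared
    atomic_cmp_swap_i32_generic        // order-agnostic variant, from order = ""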

+// e.g. with type = 32, order = ".acquire", addrspace = ".global",
+// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global,
+// preds = [hasSM<70>, hasPTX<63>], this expands to:
+// F_ATOMIC_3<i32, Int32Regs, ".acquire", ".global", ".b32",
+//            ".cas", atomic_cmp_swap_i32_acquire_global, i32imm,
+//            [hasSM<70>, hasPTX<63>]>
+multiclass INT_PTX_ATOM_CAS<string atomic_cmp_swap_pat, string type,
+                            string order, string addrspace, list<Predicate> preds>
+ : F_ATOMIC_3<!cast<ValueType>("i"#type),
+              !cast<NVPTXRegClass>("Int"#type#"Regs"),
+              order,
+              addrspace,
+              ".b"#type,
+              ".cas",
+              !cast<PatFrag>(atomic_cmp_swap_pat),
+              !cast<Operand>("i"#type#"imm"),
+              preds>;

+// Define atom.cas for all combinations of size x addrspace x memory order
+// supported in PTX *and* on the hardware.
+foreach size = ["32", "64"] in {
+  foreach addrspace = ["generic", "global", "shared"] in {
+    defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace);
+    foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
+      defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
+      // Note that AtomicExpandPass will convert a seq_cst cmpxchg into a weaker
+      // cmpxchg (acquire, for natively-supported widths) bracketed by fences,
+      // so no seq_cst variant is needed here.
+      // Memory orders are only supported on SM70+ with PTX 6.3+, so we have two
+      // sets of instruction definitions: one for SM70+, and "old" ones that
+      // lower to plain "atom.cas" for earlier architectures.
+      defm INT_PTX_ATOM_CAS_#size#_#order#addrspace
+        : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
+                           cas_order_string, cas_addrspace_string,
+                           [hasSM<70>, hasPTX<63>]>;
+      defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace
+        : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
+                           "", cas_addrspace_string, []>;
+    }
+  }
+}
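
For example, size = "32", order = "acquire", addrspace = "global" should yield instructions that print as (a sketch):

    atom.acquire.global.cas.b32 %dst, [%addr], %b, %c;  // SM70+ / PTX 6.3+ variant
    atom.global.cas.b32 %dst, [%addr], %b, %c;          // "old" variant for earlier targets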

+// Note that 16-bit CAS support in PTX is emulated.
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
+  atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
+  atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
+  atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>;

// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
-// Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
+// Get the smallest cmpxchg word size that the hardware supports.
unsigned getMinCmpXchgSizeInBits() const { return 32; }

unsigned getPTXVersion() const { return PTXVersion; }
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;