Skip to content

Commit 9a2d160

Browse files
committed
AMDGPU: Add codegen for atomicrmw operations usub_cond and usub_sat
Split off from llvm#105553 as per discussion there.
1 parent 229aa66 commit 9a2d160

30 files changed

+3374
-487
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
16121612
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
16131613
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
16141614
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
1615+
def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
1616+
def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
16151617
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
16161618
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
16171619
[llvm_anyint_ty],

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
273273
// FIXME: Check MMO is atomic
274274
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
275275
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
276+
def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
277+
def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
276278
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
277279
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
278280

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
40774077
case TargetOpcode::G_ATOMICRMW_UMAX:
40784078
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
40794079
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4080+
case TargetOpcode::G_ATOMICRMW_USUB_COND:
4081+
case TargetOpcode::G_ATOMICRMW_USUB_SAT:
40804082
case TargetOpcode::G_ATOMICRMW_FADD:
40814083
case TargetOpcode::G_ATOMICRMW_FMIN:
40824084
case TargetOpcode::G_ATOMICRMW_FMAX:

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
692692
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
693693
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
694694
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
695+
defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
696+
defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
695697
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
696698

697699
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
16711671
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
16721672
}
16731673

1674+
auto &Atomics32 =
1675+
getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1676+
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1677+
if (ST.hasFlatAddressSpace()) {
1678+
Atomics32.legalFor({{S32, FlatPtr}});
1679+
}
1680+
16741681
// TODO: v2bf16 operations, and fat buffer pointer support.
16751682
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
16761683
if (ST.hasLDSFPAtomicAddF32()) {

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
17441744
case AtomicRMWInst::FMin:
17451745
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
17461746
break;
1747+
case AtomicRMWInst::USubCond:
1748+
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
1749+
break;
1750+
case AtomicRMWInst::USubSat:
1751+
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
1752+
break;
17471753
case AtomicRMWInst::FSub: {
17481754
report_fatal_error("atomic floating point subtraction not supported for "
17491755
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
17661772
case AtomicRMWInst::UIncWrap:
17671773
case AtomicRMWInst::UDecWrap:
17681774
report_fatal_error("wrapping increment/decrement not supported for "
1769-
"buffer resources and should've ben expanded away");
1775+
"buffer resources and should've been expanded away");
17701776
break;
17711777
case AtomicRMWInst::BAD_BINOP:
17721778
llvm_unreachable("Not sure how we got a bad binop");
1773-
case AtomicRMWInst::USubCond:
1774-
case AtomicRMWInst::USubSat:
1775-
break;
17761779
}
17771780
}
17781781

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
54195419
case AMDGPU::G_ATOMICRMW_FMAX:
54205420
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
54215421
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5422+
case AMDGPU::G_ATOMICRMW_USUB_COND:
5423+
case AMDGPU::G_ATOMICRMW_USUB_SAT:
54225424
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
54235425
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
54245426
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
10891089
}
10901090
}
10911091

1092+
multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
1093+
ValueType vt, string frag> {
1094+
let OtherPredicates = [LDSRequiresM0Init] in {
1095+
def : DSAtomicRetPat<inst, vt,
1096+
!cast<PatFrag>(frag#"_local_m0_"#vt)>;
1097+
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1098+
def : DSAtomicRetPat<noRetInst, vt,
1099+
!cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
1100+
}
10921101

1102+
let OtherPredicates = [NotLDSRequiresM0Init] in {
1103+
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
1104+
!cast<PatFrag>(frag#"_local_"#vt)>;
1105+
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1106+
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
1107+
!cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
1108+
}
1109+
1110+
let OtherPredicates = [HasGDS] in {
1111+
def : DSAtomicRetPat<inst, vt,
1112+
!cast<PatFrag>(frag#"_region_m0_"#vt),
1113+
/* complexity */ 0, /* gds */ 1>;
1114+
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1115+
def : DSAtomicRetPat<noRetInst, vt,
1116+
!cast<PatFrag>(frag#"_region_m0_noret_"#vt),
1117+
/* complexity */ 1, /* gds */ 1>;
1118+
}
1119+
}
10931120

10941121
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
10951122
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1199,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
11721199
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
11731200
}
11741201

1202+
let SubtargetPredicate = isGFX12Plus in {
1203+
1204+
defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
1205+
1206+
defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
1207+
1208+
} // let SubtargetPredicate = isGFX12Plus
1209+
11751210
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
11761211
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
11771212
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,6 +1642,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
16421642
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
16431643
}
16441644

1645+
let SubtargetPredicate = isGFX12Plus in {
1646+
defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
1647+
1648+
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1649+
defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
1650+
}
16451651
} // end foreach as
16461652

16471653
let SubtargetPredicate = isGFX12Plus in {
@@ -1788,10 +1794,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
17881794
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
17891795
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
17901796
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
1791-
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
1797+
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
17921798

17931799
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1794-
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
1800+
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
17951801

17961802
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
17971803
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
18081814
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
18091815

18101816
let SubtargetPredicate = isGFX12Plus in {
1811-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
1817+
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
18121818

18131819
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1814-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
1820+
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
18151821
}
18161822

18171823
let OtherPredicates = [isGFX12Plus] in {

llvm/lib/Target/AMDGPU/R600ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
21912191
// FIXME: Cayman at least appears to have instructions for this, but the
21922192
// instruction definitions appear to be missing.
21932193
return AtomicExpansionKind::CmpXChg;
2194+
case AtomicRMWInst::USubCond:
2195+
case AtomicRMWInst::USubSat:
2196+
if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
2197+
unsigned Size = IntTy->getBitWidth();
2198+
if (Size == 32)
2199+
return AtomicExpansionKind::None;
2200+
}
2201+
return AtomicExpansionKind::CmpXChg;
21942202
case AtomicRMWInst::Xchg: {
21952203
const DataLayout &DL = RMW->getFunction()->getDataLayout();
21962204
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
994994
ISD::ATOMIC_LOAD_FMAX,
995995
ISD::ATOMIC_LOAD_UINC_WRAP,
996996
ISD::ATOMIC_LOAD_UDEC_WRAP,
997+
ISD::ATOMIC_LOAD_USUB_COND,
998+
ISD::ATOMIC_LOAD_USUB_SAT,
997999
ISD::INTRINSIC_VOID,
9981000
ISD::INTRINSIC_W_CHAIN});
9991001

@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
1680616808
}
1680716809

1680816810
/// \return true if atomicrmw integer ops work for the type.
16809-
static bool isAtomicRMWLegalIntTy(Type *Ty) {
16811+
static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
1681016812
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
1681116813
unsigned BW = IT->getBitWidth();
16812-
return BW == 32 || BW == 64;
16814+
return BW == 32 || (BW == 64 && Allow64);
1681316815
}
1681416816

1681516817
return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
1686116863

1686216864
/// \return Action to perform on AtomicRMWInsts for integer operations.
1686316865
static TargetLowering::AtomicExpansionKind
16864-
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16865-
return isAtomicRMWLegalIntTy(RMW->getType())
16866+
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
16867+
return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
1686616868
? TargetLowering::AtomicExpansionKind::None
1686716869
: TargetLowering::AtomicExpansionKind::CmpXChg;
1686816870
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1693116933
case AtomicRMWInst::UIncWrap:
1693216934
case AtomicRMWInst::UDecWrap:
1693316935
return atomicSupportedIfLegalIntType(RMW);
16936+
case AtomicRMWInst::USubCond:
16937+
case AtomicRMWInst::USubSat:
16938+
return atomicSupportedIfLegalIntType(RMW, false);
1693416939
case AtomicRMWInst::Sub:
1693516940
case AtomicRMWInst::Or:
1693616941
case AtomicRMWInst::Xor: {

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
809809
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
810810
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
811811
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
812+
defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
813+
defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
812814
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
813815
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
814816
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,12 @@ body: |
8282
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
8383
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
8484
85+
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
86+
%21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
87+
88+
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
89+
%22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
90+
8591
$vgpr0 = COPY %4(s32)
8692
SI_RETURN implicit $vgpr0
8793

0 commit comments

Comments
 (0)