Skip to content

Commit 6debee5

Browse files
committed
AMDGPU: Remove global/flat atomic fadd intrinics
These have been replaced with atomicrmw.
1 parent 09c8b09 commit 6debee5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+384
-2324
lines changed

llvm/docs/ReleaseNotes.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,11 @@ Changes to the AMDGPU Backend
137137
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
138138
`fmax` with addrspace(3) instead.
139139

140+
* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
141+
``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
142+
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd` and
143+
addrspace(0) or addrspace(1) instead.
144+
140145
Changes to the ARM Backend
141146
--------------------------
142147

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2820,8 +2820,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
28202820
// gfx908 intrinsics
28212821
// ===----------------------------------------------------------------------===//
28222822

2823-
def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
2824-
28252823
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
28262824
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
28272825
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -2860,7 +2858,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
28602858

28612859
def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
28622860
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
2863-
def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
28642861
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
28652862
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
28662863

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
10351035

10361036
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
10371037
Name.starts_with("ds.fmax") ||
1038-
Name.starts_with("global.atomic.fadd.v2bf16") ||
1039-
Name.starts_with("flat.atomic.fadd.v2bf16")) {
1038+
Name.starts_with("global.atomic.fadd") ||
1039+
Name.starts_with("flat.atomic.fadd")) {
10401040
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
10411041
// declaration.
10421042
NewFn = nullptr;

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op {
618618
}
619619
}
620620

621-
defm int_amdgcn_flat_atomic_fadd : noret_op;
622-
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
623621
defm int_amdgcn_flat_atomic_fmin : noret_op;
624622
defm int_amdgcn_flat_atomic_fmax : noret_op;
625-
defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
626-
defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
627623
defm int_amdgcn_global_atomic_fmin : noret_op;
628624
defm int_amdgcn_global_atomic_fmax : noret_op;
629625
defm int_amdgcn_global_atomic_csub : noret_op;
630-
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
631626
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
632627
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
633628
defm int_amdgcn_flat_atomic_fmax_num : noret_op;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4886,13 +4886,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48864886
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
48874887
break;
48884888
}
4889-
case Intrinsic::amdgcn_global_atomic_fadd:
48904889
case Intrinsic::amdgcn_global_atomic_csub:
48914890
case Intrinsic::amdgcn_global_atomic_fmin:
48924891
case Intrinsic::amdgcn_global_atomic_fmax:
48934892
case Intrinsic::amdgcn_global_atomic_fmin_num:
48944893
case Intrinsic::amdgcn_global_atomic_fmax_num:
4895-
case Intrinsic::amdgcn_flat_atomic_fadd:
48964894
case Intrinsic::amdgcn_flat_atomic_fmin:
48974895
case Intrinsic::amdgcn_flat_atomic_fmax:
48984896
case Intrinsic::amdgcn_flat_atomic_fmin_num:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,13 +239,11 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
239239
def : SourceOfDivergence<int_r600_read_tidig_z>;
240240
def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
241241
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
242-
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
243242
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
244243
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
245244
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
246245
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
247246
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
248-
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
249247
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
250248
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
251249
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
10181018
switch (IID) {
10191019
case Intrinsic::amdgcn_is_shared:
10201020
case Intrinsic::amdgcn_is_private:
1021-
case Intrinsic::amdgcn_flat_atomic_fadd:
10221021
case Intrinsic::amdgcn_flat_atomic_fmax:
10231022
case Intrinsic::amdgcn_flat_atomic_fmin:
10241023
case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1080,7 +1079,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
10801079
return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
10811080
{NewV, MaskOp});
10821081
}
1083-
case Intrinsic::amdgcn_flat_atomic_fadd:
10841082
case Intrinsic::amdgcn_flat_atomic_fmax:
10851083
case Intrinsic::amdgcn_flat_atomic_fmin:
10861084
case Intrinsic::amdgcn_flat_atomic_fmax_num:

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,11 +1134,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
11341134
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)),
11351135
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
11361136
}
1137-
1138-
def : DSAtomicRetPatIntrinsic<DS_ADD_RTN_F64, f64, int_amdgcn_flat_atomic_fadd_local_addrspace>;
1139-
let AddedComplexity = 1 in
1140-
def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
1141-
}
1137+
} // End SubtargetPredicate = HasLdsAtomicAddF64
11421138

11431139
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
11441140
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,25 +1621,17 @@ let OtherPredicates = [isGFX12Only] in {
16211621

16221622
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
16231623
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1624-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1625-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16261624
}
16271625

16281626
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
16291627
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
1630-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1631-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16321628
}
16331629

16341630
let OtherPredicates = [HasAtomicFaddRtnInsts] in {
16351631
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1636-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1637-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16381632
}
16391633

16401634
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
1641-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1642-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16431635
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16441636
}
16451637

@@ -1657,19 +1649,14 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax",
16571649

16581650
let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
16591651
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
1660-
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
1661-
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
16621652
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
1663-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
16641653
}
16651654

16661655
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
16671656
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
1668-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
16691657
}
16701658

16711659
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
1672-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
16731660
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
16741661
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16751662
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1335,13 +1335,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13351335
MachineMemOperand::MODereferenceable;
13361336
return true;
13371337
}
1338-
case Intrinsic::amdgcn_global_atomic_fadd:
13391338
case Intrinsic::amdgcn_global_atomic_fmin:
13401339
case Intrinsic::amdgcn_global_atomic_fmax:
13411340
case Intrinsic::amdgcn_global_atomic_fmin_num:
13421341
case Intrinsic::amdgcn_global_atomic_fmax_num:
13431342
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1344-
case Intrinsic::amdgcn_flat_atomic_fadd:
13451343
case Intrinsic::amdgcn_flat_atomic_fmin:
13461344
case Intrinsic::amdgcn_flat_atomic_fmax:
13471345
case Intrinsic::amdgcn_flat_atomic_fmin_num:
@@ -1448,13 +1446,11 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14481446
case Intrinsic::amdgcn_ds_consume:
14491447
case Intrinsic::amdgcn_ds_ordered_add:
14501448
case Intrinsic::amdgcn_ds_ordered_swap:
1451-
case Intrinsic::amdgcn_flat_atomic_fadd:
14521449
case Intrinsic::amdgcn_flat_atomic_fmax:
14531450
case Intrinsic::amdgcn_flat_atomic_fmax_num:
14541451
case Intrinsic::amdgcn_flat_atomic_fmin:
14551452
case Intrinsic::amdgcn_flat_atomic_fmin_num:
14561453
case Intrinsic::amdgcn_global_atomic_csub:
1457-
case Intrinsic::amdgcn_global_atomic_fadd:
14581454
case Intrinsic::amdgcn_global_atomic_fmax:
14591455
case Intrinsic::amdgcn_global_atomic_fmax_num:
14601456
case Intrinsic::amdgcn_global_atomic_fmin:

llvm/test/Bitcode/amdgcn-atomic.ll

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,4 +322,36 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %
322322
ret <2 x i16> %result
323323
}
324324

325+
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0
326+
327+
define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) {
328+
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
329+
%result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
330+
ret <2 x half> %result
331+
}
332+
333+
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0
334+
335+
define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
336+
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
337+
%result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
338+
ret <2 x half> %result
339+
}
340+
341+
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0
342+
343+
define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) {
344+
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
345+
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
346+
ret float %result
347+
}
348+
349+
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
350+
351+
define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
352+
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
353+
%result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
354+
ret float %result
355+
}
356+
325357
attributes #0 = { argmemonly nounwind willreturn }

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
1212
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
1313
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
1414
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
15-
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
15+
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
1616
; GFX940-NEXT: S_ENDPGM 0
1717
;
1818
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
2323
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
2424
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
2525
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
26-
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
26+
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
2727
; GFX11-NEXT: S_ENDPGM 0
2828
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
2929
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
3838
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
3939
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
4040
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
41-
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
41+
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
4242
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
4343
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
4444
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
5050
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
5151
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
5252
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
53-
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
53+
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
5454
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
5555
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
5656
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
4-
5-
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) {
6-
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic
7-
; GFX90A_GFX940: bb.1 (%ir-block.0):
8-
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
9-
; GFX90A_GFX940-NEXT: {{ $}}
10-
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
11-
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
12-
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
13-
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
14-
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
15-
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
16-
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
17-
; GFX90A_GFX940-NEXT: S_ENDPGM 0
18-
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
19-
ret void
20-
}
21-
22-
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) {
23-
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic
24-
; GFX90A_GFX940: bb.1 (%ir-block.0):
25-
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
26-
; GFX90A_GFX940-NEXT: {{ $}}
27-
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
28-
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
29-
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
30-
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
31-
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
32-
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
33-
; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
34-
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
35-
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
36-
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
37-
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
38-
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
39-
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
40-
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
41-
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
42-
ret double %ret
43-
}
2+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
3+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
444

455
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) {
466
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
@@ -82,6 +42,4 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
8242
ret double %ret
8343
}
8444

85-
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
86-
8745
!0 = !{}

0 commit comments

Comments
 (0)