Skip to content

Commit 36eaf0d

Browse files
authored
AMDGPU: Don't canonicalize fminnum/fmaxnum if targets support IEEE fminimum(maximum)_num (#127711)
For targets that support IEEE fminimum_num/fmaximum_num, the corresponding *_min_num_fXY/*_max_num_fXY instructions already canonicalize their inputs. As a result, we do not need to explicitly canonicalize the inputs for fminnum/fmaxnum.
1 parent 8337d01 commit 36eaf0d

22 files changed

+954
-1424
lines changed

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,8 @@ class LegalizerHelper {
425425
LegalizeResult lowerThreewayCompare(MachineInstr &MI);
426426
LegalizeResult lowerMinMax(MachineInstr &MI);
427427
LegalizeResult lowerFCopySign(MachineInstr &MI);
428-
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
428+
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI,
429+
bool ShouldCanonicalize = true);
429430
LegalizeResult lowerFMad(MachineInstr &MI);
430431
LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
431432
LegalizeResult lowerFFloor(MachineInstr &MI);

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5314,7 +5314,8 @@ class TargetLowering : public TargetLoweringBase {
53145314
SelectionDAG &DAG) const;
53155315

53165316
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
5317-
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
5317+
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG,
5318+
bool ShouldCanonicalize = true) const;
53185319

53195320
/// Expand fminimum/fmaximum into multiple comparison with selects.
53205321
SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8137,14 +8137,14 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
81378137
}
81388138

81398139
LegalizerHelper::LegalizeResult
8140-
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
8140+
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI, bool ShouldCanonicalize) {
81418141
unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
81428142
TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
81438143

81448144
auto [Dst, Src0, Src1] = MI.getFirst3Regs();
81458145
LLT Ty = MRI.getType(Dst);
81468146

8147-
if (!MI.getFlag(MachineInstr::FmNoNans)) {
8147+
if (ShouldCanonicalize && !MI.getFlag(MachineInstr::FmNoNans)) {
81488148
// Insert canonicalizes if it's possible we need to quiet to get correct
81498149
// sNaN behavior.
81508150

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8488,7 +8488,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
84888488
}
84898489

84908490
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
8491-
SelectionDAG &DAG) const {
8491+
SelectionDAG &DAG,
8492+
bool ShouldCanonicalize) const {
84928493
if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG))
84938494
return Expanded;
84948495

@@ -8505,7 +8506,7 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
85058506
SDValue Quiet0 = Node->getOperand(0);
85068507
SDValue Quiet1 = Node->getOperand(1);
85078508

8508-
if (!Node->getFlags().hasNoNaNs()) {
8509+
if (ShouldCanonicalize && !Node->getFlags().hasNoNaNs()) {
85098510
// Insert canonicalizes if it's possible we need to quiet to get correct
85108511
// sNaN behavior.
85118512
if (!DAG.isKnownNeverSNaN(Quiet0)) {

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2710,7 +2710,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
27102710
if (IsIEEEOp)
27112711
return true;
27122712

2713-
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2713+
return Helper.lowerFMinNumMaxNum(MI, !ST.hasIEEEMinNumMaxNum()) ==
2714+
LegalizerHelper::Legalized;
27142715
}
27152716

27162717
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14281428
// \returns true if the target has IEEE fminimum/fmaximum instructions
14291429
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
14301430

1431+
// \returns true if the target has IEEE fminimum_num/fmaximum_num
1432+
// instructions
1433+
bool hasIEEEMinNumMaxNum() const { return getGeneration() >= GFX12; }
1434+
14311435
// \returns true if the target has IEEE fminimum3/fmaximum3 instructions
14321436
bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
14331437

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6833,7 +6833,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
68336833
// mode functions, but this happens to be OK since it's only done in cases
68346834
// where there is known no sNaN.
68356835
if (IsIEEEMode)
6836-
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6836+
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG,
6837+
!Subtarget->hasIEEEMinNumMaxNum());
68376838

68386839
if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
68396840
VT == MVT::v16bf16)

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 23 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603603
; GFX12-NEXT: s_wait_kmcnt 0x0
604604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
606605
; GFX12-NEXT: s_mov_b32 s0, 0
607606
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
608607
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
609608
; GFX12-NEXT: s_wait_loadcnt 0x0
610609
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
611-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
610+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
614612
; GFX12-NEXT: s_wait_storecnt 0x0
615613
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
616614
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
757755
; GFX12-NEXT: s_wait_samplecnt 0x0
758756
; GFX12-NEXT: s_wait_bvhcnt 0x0
759757
; GFX12-NEXT: s_wait_kmcnt 0x0
760-
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761-
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
758+
; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
762759
; GFX12-NEXT: s_mov_b32 s0, 0
763760
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
764761
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
765762
; GFX12-NEXT: s_wait_loadcnt 0x0
766-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
763+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
769764
; GFX12-NEXT: s_wait_storecnt 0x0
770-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
765+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
771766
; GFX12-NEXT: s_wait_loadcnt 0x0
772767
; GFX12-NEXT: global_inv scope:SCOPE_DEV
773-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
774-
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
768+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
769+
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
775770
; GFX12-NEXT: s_wait_alu 0xfffe
776771
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
777772
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
11881183
; GFX12-NEXT: s_wait_bvhcnt 0x0
11891184
; GFX12-NEXT: s_wait_kmcnt 0x0
11901185
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11921186
; GFX12-NEXT: s_mov_b32 s0, 0
11931187
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11941188
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11951189
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11961190
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1197-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1191+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
12001193
; GFX12-NEXT: s_wait_storecnt 0x0
12011194
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12021195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13411334
; GFX12-NEXT: s_wait_samplecnt 0x0
13421335
; GFX12-NEXT: s_wait_bvhcnt 0x0
13431336
; GFX12-NEXT: s_wait_kmcnt 0x0
1344-
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345-
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1337+
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
13461338
; GFX12-NEXT: s_mov_b32 s0, 0
13471339
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13481340
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13491341
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1350-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1342+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
13531343
; GFX12-NEXT: s_wait_storecnt 0x0
1354-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1344+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13551345
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13561346
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1357-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1358-
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1347+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1348+
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
13591349
; GFX12-NEXT: s_wait_alu 0xfffe
13601350
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13611351
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
17991789
; GFX12-NEXT: s_wait_bvhcnt 0x0
18001790
; GFX12-NEXT: s_wait_kmcnt 0x0
18011791
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1802-
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1792+
; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
18031793
; GFX12-NEXT: s_mov_b32 s4, 0
18041794
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
18061795
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
18071796
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18081797
; GFX12-NEXT: s_wait_loadcnt 0x0
18091798
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
18101799
; GFX12-NEXT: s_wait_storecnt 0x0
18111800
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1812-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813-
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1801+
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
18151802
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18161803
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18171804
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
19711958
; GFX12-NEXT: s_wait_bvhcnt 0x0
19721959
; GFX12-NEXT: s_wait_kmcnt 0x0
19731960
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19751961
; GFX12-NEXT: s_mov_b32 s4, 0
1976-
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
1962+
; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
19771963
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19781964
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19791965
; GFX12-NEXT: s_wait_loadcnt 0x0
1980-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1966+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967+
; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
19811968
; GFX12-NEXT: s_wait_storecnt 0x0
1982-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984-
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985-
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1969+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970+
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
19861971
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19871972
; GFX12-NEXT: s_wait_loadcnt 0x0
19881973
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1989-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1990-
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1974+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
1975+
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
19911976
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19921977
; GFX12-NEXT: s_wait_alu 0xfffe
19931978
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 23 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
602602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603603
; GFX12-NEXT: s_wait_kmcnt 0x0
604604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
606605
; GFX12-NEXT: s_mov_b32 s0, 0
607606
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
608607
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
609608
; GFX12-NEXT: s_wait_loadcnt 0x0
610609
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
611-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
610+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
614612
; GFX12-NEXT: s_wait_storecnt 0x0
615613
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
616614
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
757755
; GFX12-NEXT: s_wait_samplecnt 0x0
758756
; GFX12-NEXT: s_wait_bvhcnt 0x0
759757
; GFX12-NEXT: s_wait_kmcnt 0x0
760-
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761-
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
758+
; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
762759
; GFX12-NEXT: s_mov_b32 s0, 0
763760
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
764761
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
765762
; GFX12-NEXT: s_wait_loadcnt 0x0
766-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768-
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
763+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
769764
; GFX12-NEXT: s_wait_storecnt 0x0
770-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
765+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
771766
; GFX12-NEXT: s_wait_loadcnt 0x0
772767
; GFX12-NEXT: global_inv scope:SCOPE_DEV
773-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
774-
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
768+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
769+
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
775770
; GFX12-NEXT: s_wait_alu 0xfffe
776771
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
777772
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
11881183
; GFX12-NEXT: s_wait_bvhcnt 0x0
11891184
; GFX12-NEXT: s_wait_kmcnt 0x0
11901185
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11921186
; GFX12-NEXT: s_mov_b32 s0, 0
11931187
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11941188
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11951189
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11961190
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1197-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
1191+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
12001193
; GFX12-NEXT: s_wait_storecnt 0x0
12011194
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12021195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
13411334
; GFX12-NEXT: s_wait_samplecnt 0x0
13421335
; GFX12-NEXT: s_wait_bvhcnt 0x0
13431336
; GFX12-NEXT: s_wait_kmcnt 0x0
1344-
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345-
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1337+
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
13461338
; GFX12-NEXT: s_mov_b32 s0, 0
13471339
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13481340
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13491341
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1350-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352-
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
1342+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
13531343
; GFX12-NEXT: s_wait_storecnt 0x0
1354-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1344+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13551345
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13561346
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1357-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1358-
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1347+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1348+
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
13591349
; GFX12-NEXT: s_wait_alu 0xfffe
13601350
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13611351
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
17991789
; GFX12-NEXT: s_wait_bvhcnt 0x0
18001790
; GFX12-NEXT: s_wait_kmcnt 0x0
18011791
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1802-
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1792+
; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
18031793
; GFX12-NEXT: s_mov_b32 s4, 0
18041794
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
18061795
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
18071796
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18081797
; GFX12-NEXT: s_wait_loadcnt 0x0
18091798
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
18101799
; GFX12-NEXT: s_wait_storecnt 0x0
18111800
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1812-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813-
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1801+
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5]
18151802
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18161803
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18171804
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
19711958
; GFX12-NEXT: s_wait_bvhcnt 0x0
19721959
; GFX12-NEXT: s_wait_kmcnt 0x0
19731960
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19751961
; GFX12-NEXT: s_mov_b32 s4, 0
1976-
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
1962+
; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
19771963
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19781964
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19791965
; GFX12-NEXT: s_wait_loadcnt 0x0
1980-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1966+
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967+
; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
19811968
; GFX12-NEXT: s_wait_storecnt 0x0
1982-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983-
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984-
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985-
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1969+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970+
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
19861971
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19871972
; GFX12-NEXT: s_wait_loadcnt 0x0
19881973
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1989-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1990-
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1974+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
1975+
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
19911976
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19921977
; GFX12-NEXT: s_wait_alu 0xfffe
19931978
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4

0 commit comments

Comments
 (0)