Skip to content

Commit 37c341d

Browse files
committed
Revert "AMDGPU: Don't canonicalize fminnum/fmaxnum if targets support IEEE fminimum(maximum)_num (#127711)"
This reverts commit 36eaf0d. This is not a sound approach to dealing with this instruction change. The new behavior is a different opcode pair, not a modifier on the existing opcode.
1 parent 3e5ae57 commit 37c341d

22 files changed

+1424
-954
lines changed

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -425,8 +425,7 @@ class LegalizerHelper {
425425
LegalizeResult lowerThreewayCompare(MachineInstr &MI);
426426
LegalizeResult lowerMinMax(MachineInstr &MI);
427427
LegalizeResult lowerFCopySign(MachineInstr &MI);
428-
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI,
429-
bool ShouldCanonicalize = true);
428+
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
430429
LegalizeResult lowerFMad(MachineInstr &MI);
431430
LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
432431
LegalizeResult lowerFFloor(MachineInstr &MI);

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5314,8 +5314,7 @@ class TargetLowering : public TargetLoweringBase {
53145314
SelectionDAG &DAG) const;
53155315

53165316
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
5317-
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG,
5318-
bool ShouldCanonicalize = true) const;
5317+
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
53195318

53205319
/// Expand fminimum/fmaximum into multiple comparison with selects.
53215320
SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8137,14 +8137,14 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
81378137
}
81388138

81398139
LegalizerHelper::LegalizeResult
8140-
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI, bool ShouldCanonicalize) {
8140+
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
81418141
unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
81428142
TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
81438143

81448144
auto [Dst, Src0, Src1] = MI.getFirst3Regs();
81458145
LLT Ty = MRI.getType(Dst);
81468146

8147-
if (ShouldCanonicalize && !MI.getFlag(MachineInstr::FmNoNans)) {
8147+
if (!MI.getFlag(MachineInstr::FmNoNans)) {
81488148
// Insert canonicalizes if it's possible we need to quiet to get correct
81498149
// sNaN behavior.
81508150

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8488,8 +8488,7 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
84888488
}
84898489

84908490
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
8491-
SelectionDAG &DAG,
8492-
bool ShouldCanonicalize) const {
8491+
SelectionDAG &DAG) const {
84938492
if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG))
84948493
return Expanded;
84958494

@@ -8506,7 +8505,7 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
85068505
SDValue Quiet0 = Node->getOperand(0);
85078506
SDValue Quiet1 = Node->getOperand(1);
85088507

8509-
if (ShouldCanonicalize && !Node->getFlags().hasNoNaNs()) {
8508+
if (!Node->getFlags().hasNoNaNs()) {
85108509
// Insert canonicalizes if it's possible we need to quiet to get correct
85118510
// sNaN behavior.
85128511
if (!DAG.isKnownNeverSNaN(Quiet0)) {

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2710,8 +2710,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
27102710
if (IsIEEEOp)
27112711
return true;
27122712

2713-
return Helper.lowerFMinNumMaxNum(MI, !ST.hasIEEEMinNumMaxNum()) ==
2714-
LegalizerHelper::Legalized;
2713+
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
27152714
}
27162715

27172716
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1428,10 +1428,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14281428
// \returns true if the target has IEEE fminimum/fmaximum instructions
14291429
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
14301430

1431-
// \returns true if the target has IEEE fminimum_num/fmaximum_num
1432-
// instructions
1433-
bool hasIEEEMinNumMaxNum() const { return getGeneration() >= GFX12; }
1434-
14351431
// \returns true if the target has IEEE fminimum3/fmaximum3 instructions
14361432
bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
14371433

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6833,8 +6833,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
68336833
// mode functions, but this happens to be OK since it's only done in cases
68346834
// where there is known no sNaN.
68356835
if (IsIEEEMode)
6836-
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG,
6837-
!Subtarget->hasIEEEMinNumMaxNum());
6836+
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
68386837

68396838
if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
68406839
VT == MVT::v16bf16)

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 38 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -602,13 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603603
; GFX12-NEXT: s_wait_kmcnt 0x0
604604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
605606
; GFX12-NEXT: s_mov_b32 s0, 0
606607
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
607608
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
608609
; GFX12-NEXT: s_wait_loadcnt 0x0
609610
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
610-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
611+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
612614
; GFX12-NEXT: s_wait_storecnt 0x0
613615
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
614616
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -755,18 +757,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
755757
; GFX12-NEXT: s_wait_samplecnt 0x0
756758
; GFX12-NEXT: s_wait_bvhcnt 0x0
757759
; GFX12-NEXT: s_wait_kmcnt 0x0
758-
; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
760+
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
759762
; GFX12-NEXT: s_mov_b32 s0, 0
760763
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
761764
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
762765
; GFX12-NEXT: s_wait_loadcnt 0x0
763-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
766+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
764769
; GFX12-NEXT: s_wait_storecnt 0x0
765-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
770+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
766771
; GFX12-NEXT: s_wait_loadcnt 0x0
767772
; GFX12-NEXT: global_inv scope:SCOPE_DEV
768-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
769-
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
773+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
774+
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
770775
; GFX12-NEXT: s_wait_alu 0xfffe
771776
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
772777
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1183,13 +1188,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
11831188
; GFX12-NEXT: s_wait_bvhcnt 0x0
11841189
; GFX12-NEXT: s_wait_kmcnt 0x0
11851190
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11861192
; GFX12-NEXT: s_mov_b32 s0, 0
11871193
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11881194
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11891195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11901196
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1191-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1197+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
11931200
; GFX12-NEXT: s_wait_storecnt 0x0
11941201
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11951202
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1334,18 +1341,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13341341
; GFX12-NEXT: s_wait_samplecnt 0x0
13351342
; GFX12-NEXT: s_wait_bvhcnt 0x0
13361343
; GFX12-NEXT: s_wait_kmcnt 0x0
1337-
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
1344+
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
13381346
; GFX12-NEXT: s_mov_b32 s0, 0
13391347
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13401348
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13411349
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1342-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1350+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
13431353
; GFX12-NEXT: s_wait_storecnt 0x0
1344-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1354+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13451355
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13461356
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1347-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1348-
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1357+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1358+
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
13491359
; GFX12-NEXT: s_wait_alu 0xfffe
13501360
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13511361
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1789,16 +1799,19 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
17891799
; GFX12-NEXT: s_wait_bvhcnt 0x0
17901800
; GFX12-NEXT: s_wait_kmcnt 0x0
17911801
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1792-
; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
1802+
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
17931803
; GFX12-NEXT: s_mov_b32 s4, 0
17941804
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
17951806
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
17961807
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
17971808
; GFX12-NEXT: s_wait_loadcnt 0x0
17981809
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
17991810
; GFX12-NEXT: s_wait_storecnt 0x0
18001811
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1801-
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
1812+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813+
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18021815
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18031816
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18041817
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1958,21 +1971,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
19581971
; GFX12-NEXT: s_wait_bvhcnt 0x0
19591972
; GFX12-NEXT: s_wait_kmcnt 0x0
19601973
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19611975
; GFX12-NEXT: s_mov_b32 s4, 0
1962-
; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
1976+
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
19631977
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19641978
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19651979
; GFX12-NEXT: s_wait_loadcnt 0x0
1966-
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967-
; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
1980+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
19681981
; GFX12-NEXT: s_wait_storecnt 0x0
1969-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970-
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1982+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984+
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985+
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
19711986
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19721987
; GFX12-NEXT: s_wait_loadcnt 0x0
19731988
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1974-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
1975-
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
1989+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1990+
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
19761991
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19771992
; GFX12-NEXT: s_wait_alu 0xfffe
19781993
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 38 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -602,13 +602,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
602602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603603
; GFX12-NEXT: s_wait_kmcnt 0x0
604604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
605606
; GFX12-NEXT: s_mov_b32 s0, 0
606607
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
607608
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
608609
; GFX12-NEXT: s_wait_loadcnt 0x0
609610
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
610-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
611+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
612614
; GFX12-NEXT: s_wait_storecnt 0x0
613615
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
614616
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -755,18 +757,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
755757
; GFX12-NEXT: s_wait_samplecnt 0x0
756758
; GFX12-NEXT: s_wait_bvhcnt 0x0
757759
; GFX12-NEXT: s_wait_kmcnt 0x0
758-
; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
760+
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
759762
; GFX12-NEXT: s_mov_b32 s0, 0
760763
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
761764
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
762765
; GFX12-NEXT: s_wait_loadcnt 0x0
763-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
766+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768+
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
764769
; GFX12-NEXT: s_wait_storecnt 0x0
765-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
770+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
766771
; GFX12-NEXT: s_wait_loadcnt 0x0
767772
; GFX12-NEXT: global_inv scope:SCOPE_DEV
768-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
769-
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
773+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
774+
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
770775
; GFX12-NEXT: s_wait_alu 0xfffe
771776
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
772777
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1183,13 +1188,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
11831188
; GFX12-NEXT: s_wait_bvhcnt 0x0
11841189
; GFX12-NEXT: s_wait_kmcnt 0x0
11851190
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11861192
; GFX12-NEXT: s_mov_b32 s0, 0
11871193
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11881194
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11891195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11901196
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1191-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
1197+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199+
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
11931200
; GFX12-NEXT: s_wait_storecnt 0x0
11941201
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11951202
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1334,18 +1341,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
13341341
; GFX12-NEXT: s_wait_samplecnt 0x0
13351342
; GFX12-NEXT: s_wait_bvhcnt 0x0
13361343
; GFX12-NEXT: s_wait_kmcnt 0x0
1337-
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
1344+
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
13381346
; GFX12-NEXT: s_mov_b32 s0, 0
13391347
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13401348
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13411349
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1342-
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3]
1350+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352+
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
13431353
; GFX12-NEXT: s_wait_storecnt 0x0
1344-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1354+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13451355
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13461356
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1347-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1348-
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1357+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1358+
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
13491359
; GFX12-NEXT: s_wait_alu 0xfffe
13501360
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13511361
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1789,16 +1799,19 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
17891799
; GFX12-NEXT: s_wait_bvhcnt 0x0
17901800
; GFX12-NEXT: s_wait_kmcnt 0x0
17911801
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1792-
; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
1802+
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
17931803
; GFX12-NEXT: s_mov_b32 s4, 0
17941804
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
17951806
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
17961807
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
17971808
; GFX12-NEXT: s_wait_loadcnt 0x0
17981809
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
17991810
; GFX12-NEXT: s_wait_storecnt 0x0
18001811
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1801-
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5]
1812+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813+
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18021815
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18031816
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18041817
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1958,21 +1971,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
19581971
; GFX12-NEXT: s_wait_bvhcnt 0x0
19591972
; GFX12-NEXT: s_wait_kmcnt 0x0
19601973
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974+
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19611975
; GFX12-NEXT: s_mov_b32 s4, 0
1962-
; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
1976+
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
19631977
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19641978
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19651979
; GFX12-NEXT: s_wait_loadcnt 0x0
1966-
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967-
; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
1980+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
19681981
; GFX12-NEXT: s_wait_storecnt 0x0
1969-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970-
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1982+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983+
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984+
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985+
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
19711986
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19721987
; GFX12-NEXT: s_wait_loadcnt 0x0
19731988
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1974-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
1975-
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
1989+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1990+
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
19761991
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19771992
; GFX12-NEXT: s_wait_alu 0xfffe
19781993
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4

0 commit comments

Comments
 (0)