AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern #141048

arsenm · 2025-05-22T12:00:43Z

This makes little difference in the final output, as we manage to form this
after these are lowered to the _ieee operations. This does result in fewer steps
in the DAG, and helps prepare for changing the handling of minnum/maxnum.

arsenm · 2025-05-22T12:00:56Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-05-22T12:01:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This makes little difference in the final output, as we manage to form this
after these are lowered to the _ieee operations. This does result in fewer steps
in the DAG, and helps prepare for changing the handling of minnum/maxnum.

Full diff: https://github.com/llvm/llvm-project/pull/141048.diff

2 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-2)
(modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+8-8)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d337fafe6dc2..ade88a16193b8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13593,10 +13593,34 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
   if (K0->getValueAPF() > K1->getValueAPF())
     return SDValue();
 
+  // med3 with a nan input acts like
+  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
+  //
+  // So the result depends on whether the IEEE mode bit is enabled or not with a
+  // signaling nan input.
+  // ieee=1
+  // s0 snan: yields s2
+  // s1 snan: yields s2
+  // s2 snan: qnan
+
+  // s0 qnan: min(s1, s2)
+  // s1 qnan: min(s0, s2)
+  // s2 qnan: min(s0, s1)
+
+  // ieee=0
+  // s0 snan: min(s1, s2)
+  // s1 snan: min(s0, s2)
+  // s2 snan: qnan
+
+  // s0 qnan: min(s1, s2)
+  // s1 qnan: min(s0, s2)
+  // s2 qnan: min(s0, s1)
   const MachineFunction &MF = DAG.getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  // TODO: Check IEEE bit enabled?
+  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
+  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
+  // can only form if op0 is fmaxnum_ieee if IEEE=1.
   EVT VT = Op0.getValueType();
   if (Info->getMode().DX10Clamp) {
     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
@@ -13714,9 +13738,14 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
       return Med3;
   }
 
-  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+  // if !is_snan(x):
+  //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
+  //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
+  //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
+  //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
+       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
       (VT == MVT::f32 || VT == MVT::f64 ||
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 01681e7d8b8ed..6274b38a63fe0 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -4115,13 +4115,13 @@ define float @v_clamp_f32_daz_minimumnum_maximumnum(float %a) #0 {
 ; GFX6-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX6-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX8-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
@@ -4154,13 +4154,13 @@ define float @v_clamp_f32_minimumnum_maximumnum(float %a) #1 {
 ; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX6-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX8-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum:
@@ -4193,13 +4193,13 @@ define float @v_clamp_f32_neg_minimumnum_maximumnum(float %a) #1 {
 ; GFX6-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e64 v0, -1.0, v0 clamp
+; GFX6-NEXT:    v_max_f32_e64 v0, -v0, -v0 clamp
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e64 v0, -1.0, v0 clamp
+; GFX8-NEXT:    v_max_f32_e64 v0, -v0, -v0 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
@@ -4233,13 +4233,13 @@ define float @v_clamp_f32_minimumnum_maximumnum_no_ieee(float %a) #5 {
 ; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX6-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e64 v0, 1.0, v0 clamp
+; GFX8-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:

arsenm · 2025-05-22T18:57:30Z

Merge activity

May 22, 6:57 PM UTC: A user started a stack merge that includes this pull request via Graphite.
May 22, 7:02 PM UTC: Graphite rebased this pull request as part of a merge.
May 22, 7:04 PM UTC: @arsenm merged this pull request with Graphite.

This makes little difference in the final output, as we manage to form this after these are lowered to the _ieee operations. This does result in fewer steps in the DAG, and helps prepare for changing the handling of minnum/maxnum.

…lvm#141048) This makes little difference in the final output, as we manage to form this after these are lowered to the _ieee operations. This does result in fewer steps in the DAG, and helps prepare for changing the handling of minnum/maxnum.

arsenm mentioned this pull request May 22, 2025

AMDGPU: Add baseline v_med3_f32 tests from minimumnum/maximumnum #141047

Merged

arsenm added backend:AMDGPU floating-point Floating-point math labels May 22, 2025 — with Graphite App

arsenm requested review from jayfoad, Pierre-vh, rampitec and vpykhtin May 22, 2025 12:01

arsenm marked this pull request as ready for review May 22, 2025 12:01

rampitec approved these changes May 22, 2025

View reviewed changes

arsenm force-pushed the users/arsenm/amdgpu/baseline-tests-med3-minimumnum-maximumnum branch from cd73596 to ad6db14 Compare May 22, 2025 18:59

Base automatically changed from users/arsenm/amdgpu/baseline-tests-med3-minimumnum-maximumnum to main May 22, 2025 19:01

arsenm force-pushed the users/arsenm/amdgpu/form-fmed3-from-immediate-pattern-minimumnum-maximumnum branch from 52ea761 to 8c1a5fe Compare May 22, 2025 19:02

arsenm merged commit db0bac0 into main May 22, 2025
6 of 9 checks passed

arsenm deleted the users/arsenm/amdgpu/form-fmed3-from-immediate-pattern-minimumnum-maximumnum branch May 22, 2025 19:04

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern #141048

AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern #141048

Uh oh!

arsenm commented May 22, 2025

Uh oh!

arsenm commented May 22, 2025 •

edited

Loading

Uh oh!

llvmbot commented May 22, 2025

Uh oh!

arsenm commented May 22, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern #141048

AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern #141048

Uh oh!

Conversation

arsenm commented May 22, 2025

Uh oh!

arsenm commented May 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 22, 2025

Uh oh!

arsenm commented May 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Merge activity

Uh oh!

Uh oh!

Uh oh!

arsenm commented May 22, 2025 •

edited

Loading

arsenm commented May 22, 2025 •

edited

Loading