Skip to content

Commit db0bac0

Browse files
authored
AMDGPU: Form v_med_f32 from minimumnum/maximumnum immediate pattern (#141048)
This makes little difference in the final output, as we manage to form this after these are lowered to the _ieee operations. This does result in fewer steps in the DAG, and helps prepare for changing the handling of minnum/maxnum.
1 parent 57e9097 commit db0bac0

File tree

2 files changed

+39
-10
lines changed

2 files changed

+39
-10
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13593,10 +13593,34 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
1359313593
if (K0->getValueAPF() > K1->getValueAPF())
1359413594
return SDValue();
1359513595

13596+
// med3 with a nan input acts like
13597+
// v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
13598+
//
13599+
// So the result depends on whether the IEEE mode bit is enabled or not with a
13600+
// signaling nan input.
13601+
// ieee=1
13602+
// s0 snan: yields s2
13603+
// s1 snan: yields s2
13604+
// s2 snan: qnan
13605+
13606+
// s0 qnan: min(s1, s2)
13607+
// s1 qnan: min(s0, s2)
13608+
// s2 qnan: min(s0, s1)
13609+
13610+
// ieee=0
13611+
// s0 snan: min(s1, s2)
13612+
// s1 snan: min(s0, s2)
13613+
// s2 snan: qnan
13614+
13615+
// s0 qnan: min(s1, s2)
13616+
// s1 qnan: min(s0, s2)
13617+
// s2 qnan: min(s0, s1)
1359613618
const MachineFunction &MF = DAG.getMachineFunction();
1359713619
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1359813620

13599-
// TODO: Check IEEE bit enabled?
13621+
// TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
13622+
// whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
13623+
// can only form it from an fmaxnum_ieee op0 when IEEE=1.
1360013624
EVT VT = Op0.getValueType();
1360113625
if (Info->getMode().DX10Clamp) {
1360213626
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
@@ -13714,9 +13738,14 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
1371413738
return Med3;
1371513739
}
1371613740

13717-
// fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13741+
// if !is_snan(x):
13742+
// fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
13743+
// fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
13744+
// fminimumnum(fmaximumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
13745+
// fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
1371813746
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
1371913747
(Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13748+
(Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
1372013749
(Opc == AMDGPUISD::FMIN_LEGACY &&
1372113750
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
1372213751
(VT == MVT::f32 || VT == MVT::f64 ||

llvm/test/CodeGen/AMDGPU/clamp.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4115,13 +4115,13 @@ define float @v_clamp_f32_daz_minimumnum_maximumnum(float %a) #0 {
41154115
; GFX6-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
41164116
; GFX6: ; %bb.0:
41174117
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4118-
; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4118+
; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp
41194119
; GFX6-NEXT: s_setpc_b64 s[30:31]
41204120
;
41214121
; GFX8-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
41224122
; GFX8: ; %bb.0:
41234123
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4124-
; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4124+
; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp
41254125
; GFX8-NEXT: s_setpc_b64 s[30:31]
41264126
;
41274127
; GFX9-LABEL: v_clamp_f32_daz_minimumnum_maximumnum:
@@ -4154,13 +4154,13 @@ define float @v_clamp_f32_minimumnum_maximumnum(float %a) #1 {
41544154
; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum:
41554155
; GFX6: ; %bb.0:
41564156
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4157-
; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4157+
; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp
41584158
; GFX6-NEXT: s_setpc_b64 s[30:31]
41594159
;
41604160
; GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum:
41614161
; GFX8: ; %bb.0:
41624162
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4163-
; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4163+
; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp
41644164
; GFX8-NEXT: s_setpc_b64 s[30:31]
41654165
;
41664166
; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum:
@@ -4193,13 +4193,13 @@ define float @v_clamp_f32_neg_minimumnum_maximumnum(float %a) #1 {
41934193
; GFX6-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
41944194
; GFX6: ; %bb.0:
41954195
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4196-
; GFX6-NEXT: v_mul_f32_e64 v0, -1.0, v0 clamp
4196+
; GFX6-NEXT: v_max_f32_e64 v0, -v0, -v0 clamp
41974197
; GFX6-NEXT: s_setpc_b64 s[30:31]
41984198
;
41994199
; GFX8-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
42004200
; GFX8: ; %bb.0:
42014201
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4202-
; GFX8-NEXT: v_mul_f32_e64 v0, -1.0, v0 clamp
4202+
; GFX8-NEXT: v_max_f32_e64 v0, -v0, -v0 clamp
42034203
; GFX8-NEXT: s_setpc_b64 s[30:31]
42044204
;
42054205
; GFX9-LABEL: v_clamp_f32_neg_minimumnum_maximumnum:
@@ -4233,13 +4233,13 @@ define float @v_clamp_f32_minimumnum_maximumnum_no_ieee(float %a) #5 {
42334233
; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:
42344234
; GFX6: ; %bb.0:
42354235
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4236-
; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4236+
; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp
42374237
; GFX6-NEXT: s_setpc_b64 s[30:31]
42384238
;
42394239
; GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:
42404240
; GFX8: ; %bb.0:
42414241
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4242-
; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp
4242+
; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp
42434243
; GFX8-NEXT: s_setpc_b64 s[30:31]
42444244
;
42454245
; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee:

0 commit comments

Comments
 (0)