Skip to content

Commit 0b0d9a3

Browse files
authored
[CodeGen] [AMDGPU] Attempt DAGCombine for fmul with select to ldexp (#111109)
Materializing a 32-bit non-inline constant operand of an fmul is relatively expensive; for specific scenarios (datatypes such as f64, f32, and f16) it is cheaper to combine the fmul into an ldexp instruction. This patch adds a DAG combine for an fmul whose select operand chooses between two values that are each an exact power of 2: ``` fmul x, select(y, A, B) -> ldexp (x, select i32 (y, a, b)) fmul x, select(y, -A, -B) -> ldexp ((fneg x), select i32 (y, a, b)) where, A=2^a & B=2^b ; a and b are integers. ``` This DAG combine is handled separately in performFMulCombine (newly defined in SIISelLowering), fusing an fmul with a select-type operand into ldexp. Fixes #104900.
1 parent 6a137fb commit 0b0d9a3

15 files changed

+4276
-2980
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
914914
ISD::FADD,
915915
ISD::FSUB,
916916
ISD::FDIV,
917+
ISD::FMUL,
917918
ISD::FMINNUM,
918919
ISD::FMAXNUM,
919920
ISD::FMINNUM_IEEE,
@@ -14629,6 +14630,66 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
1462914630
return SDValue();
1463014631
}
1463114632

14633+
/// DAG combine for FMUL: fold a multiply by a select of two powers of two
/// into an FLDEXP node, since an i32 inline exponent is cheaper to
/// materialize than an f16/f64 (or non-inline f32) constant.
///
/// \param N   the ISD::FMUL node being combined.
/// \param DCI combiner context providing the SelectionDAG.
/// \returns the replacement FLDEXP node, or an empty SDValue if the
///          pattern does not match.
SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  // Integer type for the ldexp exponent operand; keeps the element count of
  // VT so vector FMULs map to vector selects of i32 exponents.
  EVT IntVT = VT.changeElementType(MVT::i32);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // It is cheaper to realize i32 inline constants as compared against
  // materializing f16 or f64 (or even non-inline f32) values,
  // possible via ldexp usage, as shown below :
  //
  // Given : A = 2^a & B = 2^b ; where a and b are integers.
  // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
  // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
  //
  // The select must be single-use: otherwise the FP select still has to be
  // materialized for the other users and the combine saves nothing.
  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
      (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
    // Both select arms must be (splat) FP constants.
    const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
    if (!TrueNode)
      return SDValue();
    const ConstantFPSDNode *FalseNode =
        isConstOrConstSplatFP(RHS.getOperand(2));
    if (!FalseNode)
      return SDValue();

    // Mixed signs cannot be expressed with a single optional fneg on LHS.
    if (TrueNode->isNegative() != FalseNode->isNegative())
      return SDValue();

    // For f32, only non-inline constants should be transformed.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (ScalarVT == MVT::f32 &&
        TII->isInlineConstant(TrueNode->getValueAPF()) &&
        TII->isInlineConstant(FalseNode->getValueAPF()))
      return SDValue();

    // getExactLog2Abs() returns INT_MIN when |value| is not an exact
    // power of two (the sign was already handled above).
    int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
    if (TrueNodeExpVal == INT_MIN)
      return SDValue();
    int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
    if (FalseNodeExpVal == INT_MIN)
      return SDValue();

    SDLoc SL(N);
    // Re-create the select over the integer exponents; exponents may be
    // negative, hence getSignedConstant.
    SDValue SelectNode =
        DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
                    DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
                    DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));

    // Both arms negative (signs match, checked above): fold the sign into
    // an fneg of the other multiplicand.
    LHS = TrueNode->isNegative()
              ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
              : LHS;

    return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
  }

  return SDValue();
}
14692+
1463214693
SDValue SITargetLowering::performFMACombine(SDNode *N,
1463314694
DAGCombinerInfo &DCI) const {
1463414695
SelectionDAG &DAG = DCI.DAG;
@@ -14915,6 +14976,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1491514976
return performFSubCombine(N, DCI);
1491614977
case ISD::FDIV:
1491714978
return performFDivCombine(N, DCI);
14979+
case ISD::FMUL:
14980+
return performFMulCombine(N, DCI);
1491814981
case ISD::SETCC:
1491914982
return performSetCCCombine(N, DCI);
1492014983
case ISD::FMAXNUM:

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
218218
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
219219
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
220220
SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const;
221+
SDValue performFMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
221222
SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
222223
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
223224
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,10 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
8282
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
8383
; CHECK-NEXT: s_mov_b32 s4, 0x800000
8484
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
85-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000
85+
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
8686
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
87-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
88-
; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3
87+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
88+
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
8989
; CHECK-NEXT: v_log_f32_e32 v3, v3
9090
; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
9191
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
@@ -98,10 +98,10 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
9898
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
9999
; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
100100
; CHECK-NEXT: v_exp_f32_e32 v2, v2
101-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000
102-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
101+
; CHECK-NEXT: v_not_b32_e32 v3, 63
102+
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
103103
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
104-
; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3
104+
; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
105105
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
106106
; CHECK-NEXT: s_setpc_b64 s[30:31]
107107
%y = sitofp i32 %y.i to float
@@ -228,9 +228,9 @@ define float @test_powr_fast_f32(float %x, float %y) {
228228
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229229
; CHECK-NEXT: s_mov_b32 s4, 0x800000
230230
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
231-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000
232-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
233-
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3
231+
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
232+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
233+
; CHECK-NEXT: v_ldexp_f32 v0, v0, v3
234234
; CHECK-NEXT: v_log_f32_e32 v0, v0
235235
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
236236
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -242,9 +242,9 @@ define float @test_powr_fast_f32(float %x, float %y) {
242242
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
243243
; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2
244244
; CHECK-NEXT: v_exp_f32_e32 v0, v0
245-
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000
246-
; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
247-
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
245+
; CHECK-NEXT: v_not_b32_e32 v1, 63
246+
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
247+
; CHECK-NEXT: v_ldexp_f32 v0, v0, v1
248248
; CHECK-NEXT: s_setpc_b64 s[30:31]
249249
%powr = tail call fast float @_Z4powrff(float %x, float %y)
250250
ret float %powr
@@ -368,9 +368,9 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
368368
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
369369
; CHECK-NEXT: s_mov_b32 s4, 0x800000
370370
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
371-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000
372-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
373-
; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3
371+
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
372+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
373+
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
374374
; CHECK-NEXT: v_log_f32_e32 v3, v3
375375
; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
376376
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
@@ -383,10 +383,10 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
383383
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
384384
; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
385385
; CHECK-NEXT: v_exp_f32_e32 v2, v2
386-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000
387-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
386+
; CHECK-NEXT: v_not_b32_e32 v3, 63
387+
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
388388
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
389-
; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3
389+
; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
390390
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
391391
; CHECK-NEXT: s_setpc_b64 s[30:31]
392392
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
@@ -511,9 +511,9 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
511511
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512512
; CHECK-NEXT: s_mov_b32 s4, 0x800000
513513
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
514-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000
515-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
516-
; CHECK-NEXT: v_mul_f32_e64 v0, |v0|, v3
514+
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
515+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
516+
; CHECK-NEXT: v_ldexp_f32 v0, |v0|, v3
517517
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1
518518
; CHECK-NEXT: v_log_f32_e32 v0, v0
519519
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
@@ -527,9 +527,9 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
527527
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
528528
; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2
529529
; CHECK-NEXT: v_exp_f32_e32 v0, v0
530-
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000
531-
; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
532-
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
530+
; CHECK-NEXT: v_not_b32_e32 v1, 63
531+
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
532+
; CHECK-NEXT: v_ldexp_f32 v0, v0, v1
533533
; CHECK-NEXT: s_setpc_b64 s[30:31]
534534
%y = shl i32 %y.arg, 1
535535
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
@@ -651,9 +651,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
651651
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652652
; CHECK-NEXT: s_mov_b32 s4, 0x800000
653653
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
654-
; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000
655-
; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc
656-
; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3
654+
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
655+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
656+
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
657657
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
658658
; CHECK-NEXT: v_log_f32_e32 v3, v3
659659
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
@@ -667,10 +667,10 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
667667
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
668668
; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3
669669
; CHECK-NEXT: v_exp_f32_e32 v1, v1
670-
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000
671-
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
670+
; CHECK-NEXT: v_not_b32_e32 v2, 63
671+
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
672672
; CHECK-NEXT: s_brev_b32 s4, -2
673-
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
673+
; CHECK-NEXT: v_ldexp_f32 v1, v1, v2
674674
; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
675675
; CHECK-NEXT: s_setpc_b64 s[30:31]
676676
%y = or i32 %y.arg, 1

0 commit comments

Comments
 (0)