Skip to content

Commit 16bc07a

Browse files
committed
AMDGPU: Select f64 fmul by negative power of 2 to ldexp
Select fmul x, -K -> ldexp(-x, log2(fabs(K))). Select fmul fabs(x), -K -> ldexp(-|x|, log2(fabs(K))). See https://reviews.llvm.org/D158173
1 parent 4c4ff50 commit 16bc07a

File tree

4 files changed

+486
-255
lines changed

4 files changed

+486
-255
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5131,7 +5131,7 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
51315131
const MachineInstr &MI,
51325132
int OpIdx) const {
51335133
const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5134-
int ExpVal = APF.getExactLog2();
5134+
int ExpVal = APF.getExactLog2Abs();
51355135
assert(ExpVal != INT_MIN);
51365136
MIB.addImm(ExpVal);
51375137
}

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3330,7 +3330,7 @@ defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
33303330
// Convert a floating-point power of 2 to the integer exponent.
33313331
def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
33323332
const auto &APF = N->getValueAPF();
3333-
int Log2 = APF.getExactLog2();
3333+
int Log2 = APF.getExactLog2Abs();
33343334
assert(Log2 != INT_MIN);
33353335
return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32);
33363336
}]>;
@@ -3339,8 +3339,24 @@ def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
33393339
// immediate where it's preferable to emit a multiply by it as an
33403340
// ldexp. We skip over 0.5 to 4.0 as those are inline immediates
33413341
// anyway.
3342-
def fpimm_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
3343-
int Exp = Imm.getExactLog2();
3342+
def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
3343+
if (Imm.isNegative())
3344+
return false;
3345+
3346+
int Exp = Imm.getExactLog2Abs();
3347+
// Prefer leaving the FP inline immediates as they are.
3348+
// 0.5, 1.0, 2.0, 4.0
3349+
3350+
// For f64 ldexp is always better than materializing a 64-bit
3351+
// constant.
3352+
return Exp != INT_MIN && (Exp < -1 || Exp > 2);
3353+
}], FPPow2ToExponentXForm
3354+
>;
3355+
3356+
def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
3357+
if (!Imm.isNegative())
3358+
return false;
3359+
int Exp = Imm.getExactLog2Abs();
33443360
// Prefer leaving the FP inline immediates as they are.
33453361
// 0.5, 1.0, 2.0, 4.0
33463362

@@ -3353,17 +3369,32 @@ def fpimm_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
33533369
// f64 is different because we also want to handle cases that may
33543370
// require materialization of the exponent.
33553371
// TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma
3356-
// TODO: fmul x, -2^n -> ldexp(-x, n)
33573372
// TODO: For f32/f16, it's not a clear win on code size to use ldexp
33583373
// in place of mul since we have to use the vop3 form. Are there power
33593374
// savings or some other reason to prefer ldexp over mul?
33603375
def : GCNPat<
33613376
(any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)),
3362-
fpimm_pow2_prefer_ldexp_f64:$src1),
3377+
fpimm_pos_pow2_prefer_ldexp_f64:$src1),
33633378
(V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0,
33643379
0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
33653380
>;
33663381

3382+
def : GCNPat<
3383+
(any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1),
3384+
(V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0,
3385+
0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
3386+
>;
3387+
3388+
// We want to avoid using VOP3Mods which could pull in another fneg
3389+
// which would then need to be re-negated (which should never happen in
3390+
// practice). I don't see a way to apply an SDNodeXForm that accounts
3391+
// for a second operand.
3392+
def : GCNPat<
3393+
(any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1),
3394+
(V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0,
3395+
0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
3396+
>;
3397+
33673398
class AMDGPUGenericInstruction : GenericInstruction {
33683399
let Namespace = "AMDGPU";
33693400
}

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -289,11 +289,9 @@ body: |
289289
; GCN: liveins: $vgpr0_vgpr1
290290
; GCN-NEXT: {{ $}}
291291
; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
292-
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
293-
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
294-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
295-
; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
296-
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
292+
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
293+
; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 1, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
294+
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
297295
; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
298296
%0:vgpr(s64) = COPY $vgpr0_vgpr1
299297
%1:vgpr(s64) = G_FCONSTANT double -16.0
@@ -315,11 +313,9 @@ body: |
315313
; GCN: liveins: $vgpr0_vgpr1
316314
; GCN-NEXT: {{ $}}
317315
; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
318-
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
319-
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
320-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
321-
; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 2, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
322-
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
316+
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
317+
; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 3, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
318+
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
323319
; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
324320
%0:vgpr(s64) = COPY $vgpr0_vgpr1
325321
%1:vgpr(s64) = G_FABS %0
@@ -342,11 +338,14 @@ body: |
342338
; GCN: liveins: $vgpr0_vgpr1
343339
; GCN-NEXT: {{ $}}
344340
; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
345-
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
346-
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
347-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
348-
; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
349-
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
341+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
342+
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
343+
; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec
344+
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
345+
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_OR_B32_e64_]], %subreg.sub1
346+
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4
347+
; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 1, [[REG_SEQUENCE]], 0, [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
348+
; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
350349
; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
351350
%0:vgpr(s64) = COPY $vgpr0_vgpr1
352351
%1:vgpr(s64) = G_FABS %0

0 commit comments

Comments
 (0)