-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel][AArch64][AMDGPU] Expand FPOWI into series of multiplication #95217
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-aarch64 Author: None (isuckatcs) ChangesSelectionDAG already converts FPOWI into multiplications, this patch introduces the same optimization into GlobalISel. Patch is 77.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95217.diff 7 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9830b521797c1..f39216e439cbb 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7142,14 +7142,50 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
return UnableToLegalize;
}
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
- auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
+
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ std::optional<int64_t> ConstantExpValue = getIConstantVRegSExtVal(Exp, MRI);
+
+ if(!ConstantExpValue)
+ return UnableToLegalize;
+
+ int64_t OriginalExprVal = *ConstantExpValue;
+ int64_t ExpVal = OriginalExprVal;
+
+ if (ExpVal == 0) {
+ MIRBuilder.buildFConstant(Dst, 1.0);
+ MI.removeFromParent();
+ return Legalized;
+ }
+
+ if (ExpVal < 0)
+ ExpVal = -ExpVal;
+
+ Register Res = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(Res, Base);
+
+ while (--ExpVal > 0) {
+ Register Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFMul(Tmp, Res, Base);
+
+ Res = Tmp;
+ }
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (OriginalExprVal < 0) {
+ Register One = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFConstant(One, 1.0);
+
+ Register Quotient = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFDiv(Quotient, One, Res);
+
+ Res = Quotient;
+ }
- auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
- MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
+ MIRBuilder.buildCopy(Dst, Res);
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 42cd43c3afa37..65940e5a1ead0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -277,7 +277,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_FPOWI)
.scalarize(0)
.minScalar(0, s32)
- .libcallFor({{s32, s32}, {s64, s32}});
+ .customFor({{s32, s32}, {s64, s32}});
getActionDefinitionsBuilder(G_INSERT)
.legalIf(all(typeInSet(0, {s32, s64, p0}),
@@ -1263,6 +1263,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
case TargetOpcode::G_FSHL:
case TargetOpcode::G_FSHR:
return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(MI, LocObserver, Helper);
case TargetOpcode::G_ROTR:
return legalizeRotate(MI, MRI, Helper);
case TargetOpcode::G_CTPOP:
@@ -1344,6 +1346,15 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
return true;
}
+bool AArch64LegalizerInfo::legalizeFPowI(MachineInstr &MI,
+ LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const {
+ if(Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ return Helper.libcall(MI, Observer) == LegalizerHelper::Legalized;
+}
+
bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a36e4b2c..686c139aa4981 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,9 @@ class AArch64LegalizerInfo : public LegalizerInfo {
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer,
LegalizerHelper &Helper) const;
+ bool legalizeFPowI(MachineInstr &MI,
+ LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const;
bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa7..a38c74382dd3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1217,7 +1217,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_FPOWI)
.clampScalar(0, MinScalarFPTy, S32)
- .lower();
+ .custom();
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
Log2Ops.customFor({S32});
@@ -2127,6 +2127,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
return legalizeFPow(MI, B);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(Helper, MI, B, LocObserver);
case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
@@ -3731,6 +3733,21 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const {
+ if(Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
+ LLT Ty = B.getMRI()->getType(Dst);
+
+ auto CvtSrc1 = B.buildSITOFP(Ty, Exp);
+ B.buildFPow(Dst, Base, CvtSrc1, MI.getFlags());
+ MI.eraseFromParent();
+
+ return true;
+}
+
// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
Register ModSrc = OrigSrc;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc21..22d4ac0ebb4ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -95,6 +95,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
new file mode 100644
index 0000000000000..c4261277d343e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
@@ -0,0 +1,145 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: fpowi_s64_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: $d0 = COPY [[C]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: $s0 = COPY [[C]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_positive
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_positive
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FMUL3]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s64_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FDIV]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[FDIV]](s32)
+ ; CHECK-NEXT: $s0 = COPY [[COPY3]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_libcall
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_libcall
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: BL &__powidf2, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $w0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s64) = G_FPOWI %0, %1(s32)
+ $d0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872..5bd9378a6cf77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) {
}
define float @v_powi_neg1_f32(float %l) {
-; GFX78-LABEL: v_powi_neg1_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg1_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg1_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg1_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -1)
ret float %res
@@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) {
; GFX78-LABEL: v_powi_2_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 2)
ret float %res
}
define float @v_powi_neg2_f32(float %l) {
-; GFX78-LABEL: v_powi_neg2_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0,...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel Author: None (isuckatcs) ChangesSelectionDAG already converts FPOWI into multiplications, this patch introduces the same optimization into GlobalISel. Patch is 77.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95217.diff 7 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9830b521797c1..f39216e439cbb 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7142,14 +7142,50 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
return UnableToLegalize;
}
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
- auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
+
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ std::optional<int64_t> ConstantExpValue = getIConstantVRegSExtVal(Exp, MRI);
+
+ if(!ConstantExpValue)
+ return UnableToLegalize;
+
+ int64_t OriginalExprVal = *ConstantExpValue;
+ int64_t ExpVal = OriginalExprVal;
+
+ if (ExpVal == 0) {
+ MIRBuilder.buildFConstant(Dst, 1.0);
+ MI.removeFromParent();
+ return Legalized;
+ }
+
+ if (ExpVal < 0)
+ ExpVal = -ExpVal;
+
+ Register Res = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(Res, Base);
+
+ while (--ExpVal > 0) {
+ Register Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFMul(Tmp, Res, Base);
+
+ Res = Tmp;
+ }
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (OriginalExprVal < 0) {
+ Register One = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFConstant(One, 1.0);
+
+ Register Quotient = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFDiv(Quotient, One, Res);
+
+ Res = Quotient;
+ }
- auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
- MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
+ MIRBuilder.buildCopy(Dst, Res);
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 42cd43c3afa37..65940e5a1ead0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -277,7 +277,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_FPOWI)
.scalarize(0)
.minScalar(0, s32)
- .libcallFor({{s32, s32}, {s64, s32}});
+ .customFor({{s32, s32}, {s64, s32}});
getActionDefinitionsBuilder(G_INSERT)
.legalIf(all(typeInSet(0, {s32, s64, p0}),
@@ -1263,6 +1263,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
case TargetOpcode::G_FSHL:
case TargetOpcode::G_FSHR:
return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(MI, LocObserver, Helper);
case TargetOpcode::G_ROTR:
return legalizeRotate(MI, MRI, Helper);
case TargetOpcode::G_CTPOP:
@@ -1344,6 +1346,15 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
return true;
}
+bool AArch64LegalizerInfo::legalizeFPowI(MachineInstr &MI,
+ LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const {
+ if(Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ return Helper.libcall(MI, Observer) == LegalizerHelper::Legalized;
+}
+
bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a36e4b2c..686c139aa4981 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,9 @@ class AArch64LegalizerInfo : public LegalizerInfo {
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer,
LegalizerHelper &Helper) const;
+ bool legalizeFPowI(MachineInstr &MI,
+ LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const;
bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa7..a38c74382dd3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1217,7 +1217,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_FPOWI)
.clampScalar(0, MinScalarFPTy, S32)
- .lower();
+ .custom();
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
Log2Ops.customFor({S32});
@@ -2127,6 +2127,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
return legalizeFPow(MI, B);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(Helper, MI, B, LocObserver);
case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
@@ -3731,6 +3733,21 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const {
+ if(Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
+ LLT Ty = B.getMRI()->getType(Dst);
+
+ auto CvtSrc1 = B.buildSITOFP(Ty, Exp);
+ B.buildFPow(Dst, Base, CvtSrc1, MI.getFlags());
+ MI.eraseFromParent();
+
+ return true;
+}
+
// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
Register ModSrc = OrigSrc;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc21..22d4ac0ebb4ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -95,6 +95,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
new file mode 100644
index 0000000000000..c4261277d343e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
@@ -0,0 +1,145 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: fpowi_s64_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: $d0 = COPY [[C]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: $s0 = COPY [[C]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_positive
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_positive
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FMUL3]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s64_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FDIV]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[FDIV]](s32)
+ ; CHECK-NEXT: $s0 = COPY [[COPY3]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_libcall
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_libcall
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: BL &__powidf2, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $w0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s64) = G_FPOWI %0, %1(s32)
+ $d0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872..5bd9378a6cf77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) {
}
define float @v_powi_neg1_f32(float %l) {
-; GFX78-LABEL: v_powi_neg1_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg1_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg1_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg1_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -1)
ret float %res
@@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) {
; GFX78-LABEL: v_powi_2_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 2)
ret float %res
}
define float @v_powi_neg2_f32(float %l) {
-; GFX78-LABEL: v_powi_neg2_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0,...
[truncated]
|
|
Where do you see this optimization? I don't see DAGCombiner visiting FPOWI at all. I'm pretty sure this is not a viable lowering strategy. It would require some set of unsafe math flags, and should go into a combiner, not the legalizer |
It is in SelectionDAGBuilder.cpp |
Ugh, the DAG builder should definitely not be doing optimizations like this. But yes, this only works on constants and thus should not be a legalization strategy. It should be an optimization only in the CombinerHelper |
|
+1 please move this to the combiner. |
e5356f8
to
8887bb2
Compare
I moved the optimization to the combiner as requested. |
a024d3c
to
b48904b
Compare
I addressed all the comments on the PR. Is there anything else I could do to improve it, or can we merge it in this state? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Basically lgtm
46d3de2
to
4f16957
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this call isBeneficialToExpandPowI too? Otherwise it might expand to a lot of multiplies.
Said function is defined like this: /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
/// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
/// considered beneficial.
/// If optimizing for size, expansion is only considered beneficial for upto
/// 5 multiplies and a divide (if the exponent is negative).
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const {
if (Exponent < 0)
Exponent = -Exponent;
uint64_t E = static_cast<uint64_t>(Exponent);
return !OptForSize || (llvm::popcount(E) + Log2_64(E) < 7);
} So, the intrinsic is always expanded unless if we optimize for size. I haven't seen any check |
There certainly are some size checks in the DAG, if not many. Corresponds to optsize attribute |
Sorry, I made a typo in my previous comment. I meant that I haven't seen many checks for size optimization in GlobalISel. I added one regardless. Also it turned out I accidentally expanded G_FPOWI into a lot of multiplications instead of using the more optimal binary exponentiation algorithm, which is also used in the DAG and results in fewer multiplications. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. I had actually misread isBeneficialToExpandPowI and hadn't realized the heuristic was OptSize only - that the tree of multiplies was efficient enough for any possible constant. That's cool it works like that.
Other than calling isBeneficialToExpandPowI directly this looks OK to me.
SelectionDAG already converts FPOWI into multiplications, this patch introduces the same optimization into GlobalISel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. LGTM if @arsenm agrees.
…ion (llvm#95217) SelectionDAG already converts FPOWI into a series of optimized multiplications, this patch introduces the same optimization into GlobalISel.
SelectionDAG already converts FPOWI into multiplications, this patch introduces the same optimization into GlobalISel.