Skip to content

Commit 4ed66cb

Browse files
authored
AMDGPU: Improve cost handling of fma/fmuladd (#100798)
We were overcounting the cost of fast f32 FMA. Also address the TODO and handle fmuladd (which I'm just assuming lowers to FMA; the slow FMA expansion is about as fast on slow targets anyway).
1 parent e90c218 commit 4ed66cb

File tree

3 files changed

+79
-73
lines changed

3 files changed

+79
-73
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -686,7 +686,8 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
686686
// instructions for an intrinsic, even if it requires nontrivial legalization.
687687
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
688688
switch (ID) {
689-
case Intrinsic::fma: // TODO: fmuladd
689+
case Intrinsic::fma:
690+
case Intrinsic::fmuladd:
690691
// There's a small benefit to using vector ops in the legalized code.
691692
case Intrinsic::round:
692693
case Intrinsic::uadd_sat:
@@ -730,8 +731,13 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
730731

731732
switch (ICA.getID()) {
732733
case Intrinsic::fma:
733-
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
734-
: getQuarterRateInstrCost(CostKind);
734+
case Intrinsic::fmuladd:
735+
if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
736+
InstRate = getFullRateInstrCost();
737+
else {
738+
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
739+
: getQuarterRateInstrCost(CostKind);
740+
}
735741
break;
736742
case Intrinsic::uadd_sat:
737743
case Intrinsic::usub_sat:

llvm/test/Analysis/CostModel/AMDGPU/fma.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010

1111
define void @fma_f16() {
1212
; FAST-LABEL: 'fma_f16'
13-
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
14-
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
15-
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
16-
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
17-
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
18-
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
19-
; FAST-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
13+
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
14+
; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
15+
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
16+
; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
17+
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
18+
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
19+
; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
2020
; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
2121
;
2222
; SLOW-LABEL: 'fma_f16'
@@ -30,13 +30,13 @@ define void @fma_f16() {
3030
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
3131
;
3232
; FAST-SIZE-LABEL: 'fma_f16'
33-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
34-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
35-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
36-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
37-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
38-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
39-
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
33+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
34+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
35+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
36+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
37+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
38+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
39+
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
4040
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
4141
;
4242
; SLOW-SIZE-LABEL: 'fma_f16'

0 commit comments

Comments
 (0)