Skip to content

Commit 3cb0241

Browse files
authored
[RISCV][CostModel] Estimate cost of llvm.vector.reduce.fmaximum/fminimum (#80697)
The ‘llvm.vector.reduce.fmaximum/fminimum.*’ intrinsics propagate NaNs if any element of the vector is a NaN. Following #79402, the patch adds the cost for NaN check (vmfne + vcpop)
1 parent cbbbab3 commit 3cb0241

File tree

3 files changed

+136
-52
lines changed

3 files changed

+136
-52
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,51 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
10011001
return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
10021002
}
10031003

1004+
if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1005+
SmallVector<unsigned, 3> Opcodes;
1006+
InstructionCost ExtraCost = 0;
1007+
switch (IID) {
1008+
case Intrinsic::maximum:
1009+
if (FMF.noNaNs()) {
1010+
Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1011+
} else {
1012+
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1013+
RISCV::VFMV_F_S};
1014+
// Cost of Canonical Nan + branch
1015+
// lui a0, 523264
1016+
// fmv.w.x fa0, a0
1017+
Type *DstTy = Ty->getScalarType();
1018+
const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1019+
Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1020+
ExtraCost = 1 +
1021+
getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1022+
TTI::CastContextHint::None, CostKind) +
1023+
getCFInstrCost(Instruction::Br, CostKind);
1024+
}
1025+
break;
1026+
1027+
case Intrinsic::minimum:
1028+
if (FMF.noNaNs()) {
1029+
Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1030+
} else {
1031+
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1032+
RISCV::VFMV_F_S};
1033+
// Cost of Canonical Nan + branch
1034+
// lui a0, 523264
1035+
// fmv.w.x fa0, a0
1036+
Type *DstTy = Ty->getScalarType();
1037+
const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1038+
Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1039+
ExtraCost = 1 +
1040+
getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1041+
TTI::CastContextHint::None, CostKind) +
1042+
getCFInstrCost(Instruction::Br, CostKind);
1043+
}
1044+
break;
1045+
}
1046+
return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1047+
}
1048+
10041049
// IR Reduction is composed by two vmv and one rvv reduction instruction.
10051050
InstructionCost BaseCost = 2;
10061051

llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,37 @@
66

77
define float @reduce_fmaximum_f32(float %arg) {
88
; CHECK-LABEL: 'reduce_fmaximum_f32'
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
12-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
13-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
14-
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
15-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
1623
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float undef
1724
;
1825
; SIZE-LABEL: 'reduce_fmaximum_f32'
19-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
20-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
21-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
22-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
23-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
24-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
25-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
26+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
27+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
28+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
29+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
30+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
31+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
32+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
33+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
34+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
35+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
36+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
37+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
38+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
39+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
2640
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
2741
;
2842
%V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
@@ -32,6 +46,13 @@ define float @reduce_fmaximum_f32(float %arg) {
3246
%V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
3347
%V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
3448
%V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
49+
call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
50+
call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
51+
call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
52+
call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
53+
call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
54+
call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
55+
call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
3556
ret float undef
3657
}
3758
declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
@@ -44,21 +65,33 @@ declare float @llvm.vector.reduce.fmaximum.v128f32(<128 x float>)
4465

4566
define double @reduce_fmaximum_f64(double %arg) {
4667
; CHECK-LABEL: 'reduce_fmaximum_f64'
47-
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
48-
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
49-
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
50-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
51-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
52-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
68+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
69+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
70+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
71+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
72+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
73+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
74+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
75+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
76+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
77+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
78+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
79+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
5380
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double undef
5481
;
5582
; SIZE-LABEL: 'reduce_fmaximum_f64'
56-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
57-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
58-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
59-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
60-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
61-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
83+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
84+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
85+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
86+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
87+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
88+
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
89+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
90+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
91+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
92+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
93+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
94+
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
6295
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
6396
;
6497
%V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
@@ -67,6 +100,12 @@ define double @reduce_fmaximum_f64(double %arg) {
67100
%V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
68101
%V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
69102
%V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
103+
call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
104+
call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
105+
call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
106+
call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
107+
call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
108+
call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
70109
ret double undef
71110
}
72111
declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)

0 commit comments

Comments
 (0)