Skip to content

Commit 532984d

Browse files
committed
FP reduction cost functions (for SLP)
1 parent b75e1b5 commit 532984d

File tree

6 files changed

+976
-0
lines changed

6 files changed

+976
-0
lines changed

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,6 +1388,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
13881388
return NumVectorMemOps + NumPermutes;
13891389
}
13901390

1391+
// EXPERIMENTAL
1392+
static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
1393+
1394+
/// Estimate the cost of an FP reduction whose input occupies \p NumVec vector
/// registers holding elements of \p ScalarBits bits each.
///
/// The result is NumVec - 1 full vector operations plus one scalar operation
/// per element of the last remaining vector register.
///
/// File-local helper, hence the internal linkage.
static InstructionCost getFPReductionCost(unsigned NumVec,
                                          unsigned ScalarBits) {
  // Both values are unsigned: NumVec == 0 would wrap in the subtraction below
  // and ScalarBits == 0 would divide by zero.
  assert(NumVec > 0 && "Expected at least one vector register.");
  assert(ScalarBits > 0 && ScalarBits <= SystemZ::VectorBits &&
         "Unexpected scalar size.");

  const unsigned NumEltsPerVecReg = SystemZ::VectorBits / ScalarBits;
  InstructionCost Cost = 0;
  Cost += NumVec - 1;       // Full vector operations.
  Cost += NumEltsPerVecReg; // Last vector scalar operations.
  return Cost;
}
1401+
1402+
/// Cost of an arithmetic reduction of \p Ty with opcode \p Opcode.
///
/// FAdd/FMul reductions that do not have to be performed in order are costed
/// with getFPReductionCost() when the subtarget reports hasVector(). Every
/// other case is handed to the generic implementation.
InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  const bool IsFAddOrFMul =
      Opcode == Instruction::FAdd || Opcode == Instruction::FMul;

  if (TTI::requiresOrderedReduction(FMF) || !ST->hasVector() || !IsFAddOrFMul)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // EXPERIMENTAL alternatives, currently disabled:
  //  - return NumVectors * 8 for vectors with at most `redlim` elements, to
  //    leave small reductions unvectorized (=> MachineCombiner);
  //  - return NumVectors / 2, a low cost that enables the reduction heavily.
  return getFPReductionCost(getNumVectorRegs(Ty), Ty->getScalarSizeInBits());
}
1424+
1425+
/// Cost of a min/max reduction of \p Ty.
///
/// FP types are costed with getFPReductionCost() when the subtarget reports
/// hasVectorEnhancements1(). Every other case is handed to the generic
/// implementation.
InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) {
  const bool UseFPReductionCost =
      Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1();
  if (!UseFPReductionCost)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // EXPERIMENTAL alternative, currently disabled: return NumVectors / 2, a
  // low cost that enables the reduction heavily.
  return getFPReductionCost(getNumVectorRegs(Ty), Ty->getScalarSizeInBits());
}
1441+
13911442
static int
13921443
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
13931444
const SmallVectorImpl<Type *> &ParamTys) {

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
129129
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
130130
bool UseMaskForCond = false, bool UseMaskForGaps = false);
131131

132+
/// SystemZ cost of an arithmetic vector reduction of \p Ty with opcode
/// \p Opcode. \p FMF holds the optional fast-math flags of the reduction and
/// \p CostKind the kind of cost that is requested.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
133+
std::optional<FastMathFlags> FMF,
134+
TTI::TargetCostKind CostKind);
135+
/// SystemZ cost of a min/max vector reduction of \p Ty performed by the
/// intrinsic \p IID. \p FMF holds the fast-math flags of the reduction and
/// \p CostKind the kind of cost that is requested.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
136+
FastMathFlags FMF,
137+
TTI::TargetCostKind CostKind);
138+
132139
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
133140
TTI::TargetCostKind CostKind);
134141

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
3+
; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
4+
5+
; Cost of llvm.vector.reduce.fadd calls that carry no fast-math flags, for
; vectors of float, double and fp128 elements.
define void @fadd_reductions() {
6+
; Z15-LABEL: 'fadd_reductions'
7+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
8+
; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
9+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
10+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
11+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
12+
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
13+
;
14+
%fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
15+
%fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
16+
%fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
17+
%fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
18+
%fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
19+
ret void
20+
}
21+
22+
; Cost of llvm.vector.reduce.fadd calls that carry the `fast` flag, for
; vectors of float, double and fp128 elements.
define void @fast_fadd_reductions() {
23+
; Z15-LABEL: 'fast_fadd_reductions'
24+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
25+
; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
26+
; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
27+
; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
28+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
29+
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
30+
;
31+
%fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
32+
%fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
33+
%fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
34+
%fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
35+
%fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
36+
37+
ret void
38+
}
39+
40+
; Cost of llvm.vector.reduce.fmul calls that carry no fast-math flags, for
; vectors of float, double and fp128 elements.
define void @fmul_reductions() {
41+
; Z15-LABEL: 'fmul_reductions'
42+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
43+
; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
44+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
45+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
46+
; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
47+
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
48+
;
49+
%fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
50+
%fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
51+
%fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
52+
%fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
53+
%fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
54+
ret void
55+
}
56+
57+
; Cost of llvm.vector.reduce.fmul calls that carry the `fast` flag, for
; vectors of float, double and fp128 elements.
define void @fast_fmul_reductions() {
; Z15-LABEL: 'fast_fmul_reductions'
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
  %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
  %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
  %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
  %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}
74+
75+
; Cost of llvm.vector.reduce.fmin calls for vectors of float, double and
; fp128 elements.
define void @fmin_reductions() {
76+
; Z15-LABEL: 'fmin_reductions'
77+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
78+
; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
79+
; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
80+
; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
81+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
82+
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
83+
;
84+
%V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
85+
%V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
86+
%V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
87+
%V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
88+
%V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
89+
ret void
90+
}
91+
92+
; Cost of llvm.vector.reduce.fmax calls for vectors of float, double and
; fp128 elements.
define void @fmax_reductions() {
93+
; Z15-LABEL: 'fmax_reductions'
94+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
95+
; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
96+
; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
97+
; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
98+
; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
99+
; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
100+
;
101+
%V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
102+
%V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
103+
%V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
104+
%V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
105+
%V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
106+
ret void
107+
}
108+
109+
; Declarations of the vector reduction intrinsics called by the functions in
; this file, grouped by operation (fadd, fmul, fmin, fmax). The fadd and fmul
; intrinsics take a scalar operand ahead of the vector; fmin and fmax take
; only the vector.
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
110+
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
111+
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
112+
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
113+
declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
114+
115+
declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
116+
declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
117+
declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
118+
declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
119+
declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
120+
121+
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
122+
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
123+
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
124+
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
125+
declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
126+
127+
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
128+
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
129+
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
130+
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
131+
declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)

0 commit comments

Comments
 (0)