Skip to content

Commit 4ddc8df

Browse files
authored
[CostModel][ARM]Adjust cost of muls in (U/S)MLAL and patterns (#122713)
PR #117350 made changes to the SLP vectorizer which introduced a regression on some ARM benchmarks. Investigation narrowed it down to suboptimal codegen for benchmarks that previously only used scalar (U/S)MLAL instructions: the linked change meant the SLPVectorizer thought that these could be vectorized. This change makes the muls in scalar (U/S)MLAL patterns free (cost 0) on targets supporting DSP, so that scalar instructions are preferred over SLP vectorization in these cases.
1 parent 3c554de commit 4ddc8df

File tree

3 files changed

+244
-4
lines changed

3 files changed

+244
-4
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,16 +1458,73 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
14581458
if (LooksLikeAFreeShift())
14591459
return 0;
14601460

1461+
// When targets have both DSP and MVE we find that the
1462+
// compiler will attempt to vectorize as well as using
1463+
// scalar (S/U)MLAL operations. This is in cases where we have
1464+
// the pattern ext(mul(ext(i16), ext(i16))). We find
1465+
// that codegen performs better when only using (S/U)MLAL scalar
1466+
// ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1467+
// check if a mul instruction is used in a (U/S)MLAL pattern.
1468+
auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1469+
Type *Ty) -> bool {
1470+
if (!ST->hasDSP())
1471+
return false;
1472+
1473+
if (!I)
1474+
return false;
1475+
1476+
if (Opcode != Instruction::Mul)
1477+
return false;
1478+
1479+
if (Ty->isVectorTy())
1480+
return false;
1481+
1482+
auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1483+
return cast<Instruction>(LHS)->getOpcode() ==
1484+
cast<Instruction>(RHS)->getOpcode();
1485+
};
1486+
auto IsExtInst = [](const Value *V) -> bool {
1487+
return isa<ZExtInst>(V) || isa<SExtInst>(V);
1488+
};
1489+
auto IsExtensionFromHalf = [&, IsExtInst](const Value *V) -> bool {
1490+
return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1491+
};
1492+
1493+
// We check the arguments of the instruction to see if they're extends
1494+
auto *BinOp = dyn_cast<BinaryOperator>(I);
1495+
if (!BinOp)
1496+
return false;
1497+
Value *Op0 = BinOp->getOperand(0);
1498+
Value *Op1 = BinOp->getOperand(1);
1499+
if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1500+
// We're interested in an ext of an i16
1501+
if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1502+
!IsExtensionFromHalf(Op1))
1503+
return false;
1504+
// We need to check if this result will be further extended to i64
1505+
// and that all these uses are SExt
1506+
for (auto *U : I->users())
1507+
if (!IsExtInst(U))
1508+
return false;
1509+
return true;
1510+
}
1511+
1512+
return false;
1513+
};
1514+
1515+
if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1516+
return 0;
1517+
14611518
// Default to cheap (throughput/size of 1 instruction) but adjust throughput
14621519
// for "multiple beats" potentially needed by MVE instructions.
14631520
int BaseCost = 1;
14641521
if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
14651522
BaseCost = ST->getMVEVectorCostFactor(CostKind);
14661523

1467-
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1468-
// without treating floats as more expensive that scalars or increasing the
1469-
// costs for custom operations. The results is also multiplied by the
1470-
// MVEVectorCostFactor where appropriate.
1524+
// The rest of this mostly follows what is done in
1525+
// BaseT::getArithmeticInstrCost, without treating floats as more expensive
1526+
// than scalars or increasing the costs for custom operations. The result is
1527+
// also multiplied by the MVEVectorCostFactor where appropriate.
14711528
if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14721529
return LT.first * BaseCost;
14731530

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
3+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
4+
5+
define i64 @test(i16 %a, i16 %b) {
6+
; CHECK-LABEL: 'test'
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
12+
;
13+
; CHECK-NO-DSP-LABEL: 'test'
14+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
15+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
16+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
17+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
18+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
19+
;
20+
%as = sext i16 %a to i32
21+
%bs = sext i16 %b to i32
22+
%m = mul i32 %as, %bs
23+
%ms = sext i32 %m to i64
24+
ret i64 %ms
25+
}
26+
27+
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
28+
; CHECK-LABEL: 'withadd'
29+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
30+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
31+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
32+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
33+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
34+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
35+
;
36+
; CHECK-NO-DSP-LABEL: 'withadd'
37+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
38+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
39+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
40+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
41+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
42+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
43+
;
44+
%as = sext i16 %a to i32
45+
%bs = sext i16 %b to i32
46+
%m = mul i32 %as, %bs
47+
%ms = sext i32 %m to i64
48+
%r = add i64 %c, %ms
49+
ret i64 %r
50+
}
51+
52+
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
53+
; CHECK-LABEL: 'withloads'
54+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
55+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
56+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
57+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
58+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
59+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
60+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
61+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
62+
;
63+
; CHECK-NO-DSP-LABEL: 'withloads'
64+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
65+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
66+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
67+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
68+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
69+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
70+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
71+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
72+
;
73+
%a = load i16, ptr %pa
74+
%b = load i16, ptr %pb
75+
%as = sext i16 %a to i32
76+
%bs = sext i16 %b to i32
77+
%m = mul i32 %as, %bs
78+
%ms = sext i32 %m to i64
79+
%r = add i64 %c, %ms
80+
ret i64 %r
81+
}
82+
83+
define i64 @different_extend_ops(i16 %a, i16 %b) {
84+
; CHECK-LABEL: 'different_extend_ops'
85+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
86+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
87+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
88+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
89+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
90+
;
91+
; CHECK-NO-DSP-LABEL: 'different_extend_ops'
92+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
93+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
94+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
95+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
96+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
97+
;
98+
%as = sext i16 %a to i32
99+
%bs = zext i16 %b to i32
100+
%m = mul i32 %as, %bs
101+
%ms = sext i32 %m to i64
102+
ret i64 %ms
103+
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
3+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
4+
define i64 @test(i16 %a, i16 %b) {
5+
; CHECK-LABEL: 'test'
6+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
11+
;
12+
; CHECK-NO-DSP-LABEL: 'test'
13+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
14+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
15+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
16+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
17+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
18+
;
19+
%as = zext i16 %a to i32
20+
%bs = zext i16 %b to i32
21+
%m = mul i32 %as, %bs
22+
%ms = zext i32 %m to i64
23+
ret i64 %ms
24+
}
25+
26+
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
27+
; CHECK-LABEL: 'withadd'
28+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
29+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
30+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
31+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
32+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
33+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
34+
;
35+
; CHECK-NO-DSP-LABEL: 'withadd'
36+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
37+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
38+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
39+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
40+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
41+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
42+
;
43+
%as = zext i16 %a to i32
44+
%bs = zext i16 %b to i32
45+
%m = mul i32 %as, %bs
46+
%ms = zext i32 %m to i64
47+
%r = add i64 %c, %ms
48+
ret i64 %r
49+
}
50+
51+
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
52+
; CHECK-LABEL: 'withloads'
53+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
54+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
55+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
56+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
57+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
58+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
59+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
60+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
61+
;
62+
; CHECK-NO-DSP-LABEL: 'withloads'
63+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
64+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
65+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
66+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
67+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
68+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
69+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
70+
; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
71+
;
72+
%a = load i16, ptr %pa
73+
%b = load i16, ptr %pb
74+
%as = zext i16 %a to i32
75+
%bs = zext i16 %b to i32
76+
%m = mul i32 %as, %bs
77+
%ms = zext i32 %m to i64
78+
%r = add i64 %c, %ms
79+
ret i64 %r
80+
}

0 commit comments

Comments
 (0)