Skip to content

Commit a602844

Browse files
committed
[ARM]Reduce scalar cost of SMLAL muls
Changes to the SLPVectorizer caused a regression in some benchmarks which targeted cores that support both DSP and MVE instructions. The particular regression has been reduced to a case where MUL instructions that are part of a chain of instructions that can be replaced with DSP SMLAL may also be vectorized. The generated code ends up being an inefficient mix of both scalar and vector ops rather than leaning to one or the other. By reducing the cost of these MUL instructions in these patterns we recover lost performance. Change-Id: I302817cf4fcd18a11d40fba430c44e034a36448b
1 parent 1921822 commit a602844

File tree

3 files changed

+88
-34
lines changed

3 files changed

+88
-34
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,16 +1458,62 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
14581458
if (LooksLikeAFreeShift())
14591459
return 0;
14601460

1461+
// When targets have both DSP and MVE we find that the
1462+
// compiler will attempt to vectorize as well as using
1463+
// scalar SMLAL operations. This is in cases where we have
1464+
// the pattern ext(mul(ext(i16), ext(i16))) we find
1465+
// that generated codegen performs better when only using SMLAL scalar
1466+
// ops instead of trying to mix vector ops with SMLAL ops. We therefore
1467+
// check if a mul instruction is used in a SMLAL pattern.
1468+
// Detects a scalar mul that is part of the pattern
//   sext(mul(sext(i16), sext(i16)))
// on targets that have both DSP and MVE integer ops. The caller treats such a
// mul as free so that the chain is kept as scalar SMLAL ops instead of being
// partially vectorized.
//
// I      - context instruction being costed; may be null (returns false).
// Opcode - opcode being costed; only Instruction::Mul can match.
// Ty     - type being costed; vector types never match.
//
// Returns true only when every part of the pattern above is present.
auto MulInSMLALPattern = [&](const Instruction *I, unsigned Opcode,
                             Type *Ty) -> bool {
  if (!ST->hasDSP() || !ST->hasMVEIntegerOps())
    return false;

  if (!I || Opcode != Instruction::Mul || Ty->isVectorTy())
    return false;

  // True when V is a sign-extension whose *source* operand is an i16. Checking
  // only the result type would also accept extensions from i1/i8, which is not
  // the ext(i16) pattern this heuristic is meant to cover.
  auto IsSExtFromHalf = [](const Value *V) -> bool {
    const auto *Ext = dyn_cast<SExtInst>(V);
    return Ext && Ext->getOperand(0)->getType()->isIntegerTy(16);
  };

  // We check the operands of the instruction to see if they're extends.
  const auto *BinOp = dyn_cast<BinaryOperator>(I);
  if (!BinOp)
    return false;
  if (!IsSExtFromHalf(BinOp->getOperand(0)) ||
      !IsSExtFromHalf(BinOp->getOperand(1)))
    return false;

  // The multiply must be performed at i32 (both operands of a binary operator
  // share the result type, so one check covers the two extends as well).
  if (!BinOp->getType()->isIntegerTy(32))
    return false;

  // The product must be sign-extended again by at least one user; this is
  // expected to be the extension to i64, but the destination width is not
  // checked here.
  for (const auto *U : I->users())
    if (isa<SExtInst>(U))
      return true;

  return false;
};
1503+
1504+
if (MulInSMLALPattern(CxtI, Opcode, Ty))
1505+
return 0;
1506+
14611507
// Default to cheap (throughput/size of 1 instruction) but adjust throughput
14621508
// for "multiple beats" potentially needed by MVE instructions.
14631509
int BaseCost = 1;
14641510
if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
14651511
BaseCost = ST->getMVEVectorCostFactor(CostKind);
14661512

1467-
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1468-
// without treating floats as more expensive that scalars or increasing the
1469-
// costs for custom operations. The results is also multiplied by the
1470-
// MVEVectorCostFactor where appropriate.
1513+
// The rest of this mostly follows what is done in
1514+
// BaseT::getArithmeticInstrCost, without treating floats as more expensive
1515+
// than scalars or increasing the costs for custom operations. The result is
1516+
// also multiplied by the MVEVectorCostFactor where appropriate.
14711517
if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14721518
return LT.first * BaseCost;
14731519

@@ -1806,7 +1852,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
18061852
((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
18071853
(LT.second == MVT::v8i16 && RevVTSize <= 32) ||
18081854
(LT.second == MVT::v4i32 && RevVTSize <= 64)))
1809-
return 3 * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1855+
return ST->getMVEVectorCostFactor(CostKind) * LT.first;
18101856
}
18111857
break;
18121858
default:
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+mve,+dsp < %s | FileCheck %s
2+
define i64 @test(i16 %a, i16 %b) {
3+
; CHECK-LABEL: 'test'
4+
; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
5+
;
6+
%as = sext i16 %a to i32
7+
%bs = sext i16 %b to i32
8+
%m = mul i32 %as, %bs
9+
%ms = sext i32 %m to i64
10+
ret i64 %ms
11+
}
12+
13+
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
14+
; CHECK-LABEL: 'withadd'
15+
; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
16+
;
17+
%as = sext i16 %a to i32
18+
%bs = sext i16 %b to i32
19+
%m = mul i32 %as, %bs
20+
%ms = sext i32 %m to i64
21+
%r = add i64 %c, %ms
22+
ret i64 %r
23+
}
24+
25+
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
26+
; CHECK-LABEL: 'withloads'
27+
; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
28+
;
29+
%a = load i16, ptr %pa
30+
%b = load i16, ptr %pb
31+
%as = sext i16 %a to i32
32+
%bs = sext i16 %b to i32
33+
%m = mul i32 %as, %bs
34+
%ms = sext i32 %m to i64
35+
%r = add i64 %c, %ms
36+
ret i64 %r
37+
}

llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll

Lines changed: 0 additions & 29 deletions
This file was deleted.

0 commit comments

Comments
 (0)