Skip to content

Commit 8afed5f

Browse files
committed
[CostModel] getInstructionCost - improve estimation of costs for length changing shuffles
Fix a gap in the cost estimation for length-changing shuffles by widening/narrowing the shuffle mask and either widening the shuffle inputs or extracting the lower elements of the result. This is a small step towards moving some of this implementation inside improveShuffleKindFromMask and/or the target getShuffleCost handlers (and towards reducing the differences in cost estimation depending on whether the shuffle comes from a ShuffleVectorInst or from the raw operands / mask components).
1 parent cf922e5 commit 8afed5f

12 files changed

+763
-514
lines changed

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,28 +1326,29 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
13261326
auto *VecSrcTy = cast<VectorType>(Operands[0]->getType());
13271327
int NumSubElts, SubIndex;
13281328

1329+
// TODO: move more of this inside improveShuffleKindFromMask.
13291330
if (Shuffle->changesLength()) {
1331+
ArrayRef<int> Mask = Shuffle->getShuffleMask();
1332+
13301333
// Treat a 'subvector widening' as a free shuffle.
13311334
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
13321335
return 0;
13331336

13341337
if (Shuffle->isExtractSubvectorMask(SubIndex))
13351338
return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
1336-
Shuffle->getShuffleMask(), CostKind,
1337-
SubIndex, VecTy, Operands);
1339+
Mask, CostKind, SubIndex, VecTy,
1340+
Operands);
13381341

13391342
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
13401343
return TargetTTI->getShuffleCost(
1341-
TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(),
1342-
CostKind, SubIndex,
1344+
TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
13431345
FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
13441346
Operands);
13451347

13461348
int ReplicationFactor, VF;
13471349
if (Shuffle->isReplicationMask(ReplicationFactor, VF)) {
1348-
APInt DemandedDstElts =
1349-
APInt::getZero(Shuffle->getShuffleMask().size());
1350-
for (auto I : enumerate(Shuffle->getShuffleMask())) {
1350+
APInt DemandedDstElts = APInt::getZero(Mask.size());
1351+
for (auto I : enumerate(Mask)) {
13511352
if (I.value() != PoisonMaskElem)
13521353
DemandedDstElts.setBit(I.index());
13531354
}
@@ -1356,7 +1357,35 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
13561357
DemandedDstElts, CostKind);
13571358
}
13581359

1359-
return CostKind == TTI::TCK_RecipThroughput ? -1 : 1;
1360+
bool IsUnary = isa<UndefValue>(Operands[1]);
1361+
NumSubElts = VecSrcTy->getElementCount().getKnownMinValue();
1362+
SmallVector<int, 16> AdjustMask(Mask.begin(), Mask.end());
1363+
1364+
// Widening shuffle - widen the source(s) to the new length
1365+
// (treated as free - see above), and then perform the adjusted
1366+
// shuffle at that width.
1367+
if (Shuffle->increasesLength()) {
1368+
for (int &M : AdjustMask)
1369+
M = M >= NumSubElts ? (M + (Mask.size() - NumSubElts)) : M;
1370+
1371+
return TargetTTI->getShuffleCost(
1372+
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
1373+
AdjustMask, CostKind, 0, nullptr);
1374+
}
1375+
1376+
// Narrowing shuffle - perform shuffle at original wider width and
1377+
// then extract the lower elements.
1378+
AdjustMask.append(NumSubElts - Mask.size(), -1);
1379+
1380+
InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
1381+
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
1382+
VecSrcTy, AdjustMask, CostKind, 0, nullptr);
1383+
1384+
SmallVector<int, 16> ExtractMask(Mask.size(), -1);
1385+
std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
1386+
return ShuffleCost + TargetTTI->getShuffleCost(
1387+
TTI::SK_ExtractSubvector, VecTy, ExtractMask,
1388+
CostKind, 0, VecSrcTy, Operands);
13601389
}
13611390

13621391
if (Shuffle->isIdentity())

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 184 additions & 184 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
1919
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2020
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2121
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
22-
; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
2323
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2424
;
2525
; VLEN128-LABEL: 'test_vXf64'
@@ -32,7 +32,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
3232
; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3333
; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
3434
; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
35-
; VLEN128-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
35+
; VLEN128-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
3636
; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
3737
;
3838
%V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>

llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ define <8 x i64> @interleave2_v8i64(<4 x i64> %v0, <4 x i64> %v1) {
5656
; TODO: getInstructionCost doesn't call getShuffleCost here because the shuffle changes length
5757
define {<4 x i8>, <4 x i8>} @deinterleave_2(<8 x i8> %v) {
5858
; CHECK-LABEL: 'deinterleave_2'
59-
; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %v0 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
60-
; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %v1 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
59+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v0 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
60+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v1 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
6161
; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %res0 = insertvalue { <4 x i8>, <4 x i8> } poison, <4 x i8> %v0, 0
6262
; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %res1 = insertvalue { <4 x i8>, <4 x i8> } %res0, <4 x i8> %v1, 1
6363
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret { <4 x i8>, <4 x i8> } %res1

0 commit comments

Comments
 (0)