Skip to content

Commit 07d284d

Browse files
[SLP]Add cost estimation for gather node reshuffling
Adds cost estimation for the variants of the permutations of the scalar values, used in gather nodes. Currently, SLP just unconditionally emits shuffles for the reused buildvectors, but in some cases it is better to leave them as buildvectors rather than shuffles, if the cost of such buildvectors is better.

X86, AVX512, -O3+LTO
Metric: size..text

Program | size..text results | results0 | diff
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test | 912998.00 | 913238.00 | 0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test | 203070.00 | 203102.00 | 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test | 1396320.00 | 1396448.00 | 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test | 1396320.00 | 1396448.00 | 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test | 309790.00 | 309678.00 | -0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test | 12477607.00 | 12470807.00 | -0.1%

CINT2006/445.gobmk - extra code vectorized
MiBench/consumer-lame - small variations
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vectorized code
Benchmarks/Bullet - extra code vectorized
CFP2017rate/526.blender_r - extra vector code

RISC-V, sifive-p670, -O3+LTO
CFP2006/433.milc - regressions, should be fixed by #115173
CFP2006/453.povray - extra vectorized code
CFP2017rate/508.namd_r - better vector code
CFP2017rate/510.parest_r - extra vectorized code
SPEC/CFP2017rate - extra/better vector code
CFP2017rate/526.blender_r - extra vectorized code
CFP2017rate/538.imagick_r - extra vectorized code
CINT2006/403.gcc - extra vectorized code
CINT2006/445.gobmk - extra vectorized code
CINT2006/464.h264ref - extra vectorized code
CINT2006/483.xalancbmk - small variations
CINT2017rate/525.x264_r - better vectorization

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: #115201
1 parent 2d038ca commit 07d284d

19 files changed

+955
-967
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,33 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
259259
return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
260260
}
261261

262+
/// Checks if the provided mask \p Mask is a splat mask, i.e. it contains only
263+
/// -1 or the same non -1 index value, and this index value occurs at least twice.
264+
/// So, mask <0, -1,-1, -1> is not considered splat (it is just identity),
265+
/// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat
266+
/// with \p Index=2. Any index >= 2 * \p NumSrcElts rejects the mask; \p Index is written only when true is returned.
267+
static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
268+
// Check that the broadcast index occurs at least twice: IsCompared becomes true once a second non-poison element is seen, and reaching the last element without that fails.
269+
bool IsCompared = false;
270+
if (int SplatIdx = PoisonMaskElem;
271+
all_of(enumerate(Mask), [&](const auto &P) {
272+
if (P.value() == PoisonMaskElem)
273+
return P.index() != Mask.size() - 1 || IsCompared;
274+
if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
275+
return false;
276+
if (SplatIdx == PoisonMaskElem) {
277+
SplatIdx = P.value();
278+
return P.index() != Mask.size() - 1;
279+
}
280+
IsCompared = true;
281+
return SplatIdx == P.value();
282+
})) {
283+
Index = SplatIdx;
284+
return true;
285+
}
286+
return false;
287+
}
288+
262289
protected:
263290
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
264291
: BaseT(DL) {}
@@ -1014,17 +1041,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
10141041
return Kind;
10151042
int NumSrcElts = Ty->getElementCount().getKnownMinValue();
10161043
switch (Kind) {
1017-
case TTI::SK_PermuteSingleSrc:
1044+
case TTI::SK_PermuteSingleSrc: {
10181045
if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
10191046
return TTI::SK_Reverse;
10201047
if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
10211048
return TTI::SK_Broadcast;
1049+
if (isSplatMask(Mask, NumSrcElts, Index))
1050+
return TTI::SK_Broadcast;
10221051
if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
10231052
(Index + Mask.size()) <= (size_t)NumSrcElts) {
10241053
SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
10251054
return TTI::SK_ExtractSubvector;
10261055
}
10271056
break;
1057+
}
10281058
case TTI::SK_PermuteTwoSrc: {
10291059
int NumSubElts;
10301060
if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 138 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13199,6 +13199,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1319913199
// No perfect match, just shuffle, so choose the first tree node from the
1320013200
// tree.
1320113201
Entries.push_back(FirstEntries.front());
13202+
VF = FirstEntries.front()->getVectorFactor();
1320213203
} else {
1320313204
// Try to find nodes with the same vector factor.
1320413205
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -13239,6 +13240,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1323913240
Entries.push_back(SecondEntries.front());
1324013241
VF = std::max(Entries.front()->getVectorFactor(),
1324113242
Entries.back()->getVectorFactor());
13243+
} else {
13244+
VF = Entries.front()->getVectorFactor();
1324213245
}
1324313246
}
1324413247

@@ -13350,17 +13353,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1335013353
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
1335113354
IsIdentity &= Mask[Idx] == Pair.second;
1335213355
}
13353-
switch (Entries.size()) {
13354-
case 1:
13355-
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13356-
return TargetTransformInfo::SK_PermuteSingleSrc;
13357-
break;
13358-
case 2:
13359-
if (EntryLanes.size() > 2 || VL.size() <= 2)
13360-
return TargetTransformInfo::SK_PermuteTwoSrc;
13361-
break;
13362-
default:
13363-
break;
13356+
if (ForOrder || IsIdentity || Entries.empty()) {
13357+
switch (Entries.size()) {
13358+
case 1:
13359+
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13360+
return TargetTransformInfo::SK_PermuteSingleSrc;
13361+
break;
13362+
case 2:
13363+
if (EntryLanes.size() > 2 || VL.size() <= 2)
13364+
return TargetTransformInfo::SK_PermuteTwoSrc;
13365+
break;
13366+
default:
13367+
break;
13368+
}
13369+
} else if (!isa<VectorType>(VL.front()->getType()) &&
13370+
(EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13371+
// Do the cost estimation to check if the shuffle is more beneficial than the buildvector.
13372+
SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13373+
std::next(Mask.begin(), (Part + 1) * VL.size()));
13374+
int MinElement = SubMask.front(), MaxElement = SubMask.front();
13375+
for (int Idx : SubMask) {
13376+
if (Idx == PoisonMaskElem)
13377+
continue;
13378+
if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13379+
MinElement = Idx;
13380+
if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13381+
MaxElement = Idx;
13382+
}
13383+
assert(MaxElement >= 0 && MinElement >= 0 &&
13384+
MaxElement % VF >= MinElement % VF &&
13385+
"Expected at least single element.");
13386+
unsigned NewVF = std::max<unsigned>(
13387+
VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13388+
(MaxElement % VF) -
13389+
(MinElement % VF) + 1));
13390+
if (NewVF < VF) {
13391+
for_each(SubMask, [&](int &Idx) {
13392+
if (Idx == PoisonMaskElem)
13393+
return;
13394+
Idx = (Idx % VF) - (MinElement % VF) +
13395+
(Idx >= static_cast<int>(VF) ? NewVF : 0);
13396+
});
13397+
VF = NewVF;
13398+
}
13399+
13400+
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13401+
auto *VecTy = getWidenedType(VL.front()->getType(), VF);
13402+
auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13403+
auto GetShuffleCost = [&,
13404+
&TTI = *TTI](ArrayRef<int> Mask,
13405+
ArrayRef<const TreeEntry *> Entries,
13406+
VectorType *VecTy) -> InstructionCost {
13407+
if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13408+
ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13409+
Mask, Entries.front()->getInterleaveFactor()))
13410+
return TTI::TCC_Free;
13411+
return ::getShuffleCost(TTI,
13412+
Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13413+
: TTI::SK_PermuteSingleSrc,
13414+
VecTy, Mask, CostKind);
13415+
};
13416+
InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13417+
InstructionCost FirstShuffleCost = 0;
13418+
SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13419+
if (Entries.size() == 1 || !Entries[0]->isGather()) {
13420+
FirstShuffleCost = ShuffleCost;
13421+
} else {
13422+
// Transform mask to include only first entry.
13423+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13424+
bool IsIdentity = true;
13425+
for (auto [I, Idx] : enumerate(FirstMask)) {
13426+
if (Idx >= static_cast<int>(VF)) {
13427+
Idx = PoisonMaskElem;
13428+
} else {
13429+
DemandedElts.clearBit(I);
13430+
if (Idx != PoisonMaskElem)
13431+
IsIdentity &= static_cast<int>(I) == Idx;
13432+
}
13433+
}
13434+
if (!IsIdentity)
13435+
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13436+
FirstShuffleCost += TTI->getScalarizationOverhead(
13437+
MaskVecTy, DemandedElts, /*Insert=*/true,
13438+
/*Extract=*/false, CostKind);
13439+
}
13440+
InstructionCost SecondShuffleCost = 0;
13441+
SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13442+
if (Entries.size() == 1 || !Entries[1]->isGather()) {
13443+
SecondShuffleCost = ShuffleCost;
13444+
} else {
13445+
// Transform mask to include only second entry.
13446+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13447+
bool IsIdentity = true;
13448+
for (auto [I, Idx] : enumerate(SecondMask)) {
13449+
if (Idx < static_cast<int>(VF) && Idx >= 0) {
13450+
Idx = PoisonMaskElem;
13451+
} else {
13452+
DemandedElts.clearBit(I);
13453+
if (Idx != PoisonMaskElem) {
13454+
Idx -= VF;
13455+
IsIdentity &= static_cast<int>(I) == Idx;
13456+
}
13457+
}
13458+
}
13459+
if (!IsIdentity)
13460+
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13461+
SecondShuffleCost += TTI->getScalarizationOverhead(
13462+
MaskVecTy, DemandedElts, /*Insert=*/true,
13463+
/*Extract=*/false, CostKind);
13464+
}
13465+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13466+
for (auto [I, Idx] : enumerate(SubMask))
13467+
if (Idx == PoisonMaskElem)
13468+
DemandedElts.clearBit(I);
13469+
InstructionCost BuildVectorCost =
13470+
TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13471+
/*Extract=*/false, CostKind);
13472+
const TreeEntry *BestEntry = nullptr;
13473+
if (FirstShuffleCost < ShuffleCost) {
13474+
copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
13475+
BestEntry = Entries.front();
13476+
ShuffleCost = FirstShuffleCost;
13477+
}
13478+
if (SecondShuffleCost < ShuffleCost) {
13479+
copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
13480+
BestEntry = Entries[1];
13481+
ShuffleCost = SecondShuffleCost;
13482+
}
13483+
if (BuildVectorCost >= ShuffleCost) {
13484+
if (BestEntry) {
13485+
Entries.clear();
13486+
Entries.push_back(BestEntry);
13487+
}
13488+
return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13489+
: TargetTransformInfo::SK_PermuteSingleSrc;
13490+
}
1336413491
}
1336513492
Entries.clear();
1336613493
// Clear the corresponding mask elements.

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -399,13 +399,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
399399
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
400400
; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
401401
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
402-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
402+
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
403403
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
404404
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
405405
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
406406
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
407407
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
408-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
408+
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
409409
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
410410
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
411411
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -436,13 +436,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
436436
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
437437
; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
438438
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
439-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
439+
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
440440
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
441441
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
442442
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
443443
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
444444
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
445-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
445+
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
446446
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
447447
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
448448
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
@@ -476,13 +476,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
476476
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
477477
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
478478
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
479-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
479+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
480480
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
481481
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
482482
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
483483
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
484484
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
485-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
485+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
486486
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
487487
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
488488
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -513,13 +513,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
513513
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
514514
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
515515
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
516-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
516+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
517517
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
518518
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
519519
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
520520
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
521521
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
522-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
522+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
523523
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
524524
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
525525
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>

0 commit comments

Comments
 (0)