-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV][TTI]Use processShuffleMasks for cost estimations/actual per-register shuffles #118103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-x86 Author: Alexey Bataev (alexey-bataev). Changes: Patch adds usage of processShuffleMasks in TTI for RISCV and in codegen. Patch is 20.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118103.diff — 8 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 989090b80e1c87..5f7aa530342489 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -504,25 +504,26 @@ void llvm::processShuffleMasks(
unsigned SzSrc = Sz / NumOfSrcRegs;
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
auto &RegMasks = Res[I];
- RegMasks.assign(NumOfSrcRegs, {});
+ RegMasks.assign(2 * NumOfSrcRegs, {});
// Check that the values in dest registers are in the one src
// register.
for (unsigned K = 0; K < SzDest; ++K) {
int Idx = I * SzDest + K;
if (Idx == Sz)
break;
- if (Mask[Idx] >= Sz || Mask[Idx] == PoisonMaskElem)
+ if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
continue;
- int SrcRegIdx = Mask[Idx] / SzSrc;
+ int MaskIdx = Mask[Idx] % Sz;
+ int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
// Add a cost of PermuteTwoSrc for each new source register permute,
// if we have more than one source registers.
if (RegMasks[SrcRegIdx].empty())
RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
- RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc;
+ RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
}
}
// Process split mask.
- for (unsigned I = 0; I < NumOfUsedRegs; ++I) {
+ for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
auto &Dest = Res[I];
int NumSrcRegs =
count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
@@ -567,7 +568,7 @@ void llvm::processShuffleMasks(
int FirstIdx = -1;
SecondIdx = -1;
MutableArrayRef<int> FirstMask, SecondMask;
- for (unsigned I = 0; I < NumOfDestRegs; ++I) {
+ for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
SmallVectorImpl<int> &RegMask = Dest[I];
if (RegMask.empty())
continue;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 78dc3cb27a6988..9947b733037657 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5081,7 +5081,6 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
SDValue V1 = SVN->getOperand(0);
SDValue V2 = SVN->getOperand(1);
ArrayRef<int> Mask = SVN->getMask();
- unsigned NumElts = VT.getVectorNumElements();
// If we don't know exact data layout, not much we can do. If this
// is already m1 or smaller, no point in splitting further.
@@ -5098,58 +5097,70 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
MVT ElemVT = VT.getVectorElementType();
unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
- unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
-
- SmallVector<std::pair<int, SmallVector<int>>>
- OutMasks(VRegsPerSrc, {-1, {}});
-
- // Check if our mask can be done as a 1-to-1 mapping from source
- // to destination registers in the group without needing to
- // write each destination more than once.
- for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
- int DstVecIdx = DstIdx / ElemsPerVReg;
- int DstSubIdx = DstIdx % ElemsPerVReg;
- int SrcIdx = Mask[DstIdx];
- if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
- continue;
- int SrcVecIdx = SrcIdx / ElemsPerVReg;
- int SrcSubIdx = SrcIdx % ElemsPerVReg;
- if (OutMasks[DstVecIdx].first == -1)
- OutMasks[DstVecIdx].first = SrcVecIdx;
- if (OutMasks[DstVecIdx].first != SrcVecIdx)
- // Note: This case could easily be handled by keeping track of a chain
- // of source values and generating two element shuffles below. This is
- // less an implementation question, and more a profitability one.
- return SDValue();
-
- OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
- OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
- }
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
assert(M1VT == getLMUL1VT(M1VT));
unsigned NumOpElts = M1VT.getVectorMinNumElements();
- SDValue Vec = DAG.getUNDEF(ContainerVT);
+ unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
+ unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
+ unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
// The following semantically builds up a fixed length concat_vector
// of the component shuffle_vectors. We eagerly lower to scalable here
// to avoid DAG combining it back to a large shuffle_vector again.
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
- for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
- auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
- if (SrcVecIdx == -1)
+ SmallVector<SDValue> SubRegs(NumOfDestRegs);
+ unsigned RegCnt = 0;
+ unsigned PrevCnt = 0;
+ processShuffleMasks(
+ Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
+ [&]() {
+ PrevCnt = RegCnt;
+ ++RegCnt;
+ },
+ [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx,
+ unsigned DstVecIdx) {
+ SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+ SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+ SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+ PrevCnt = RegCnt;
+ ++RegCnt;
+ },
+ [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2) {
+ if (PrevCnt + 1 == RegCnt)
+ ++RegCnt;
+ SDValue SubVec1 = SubRegs[PrevCnt + 1];
+ if (!SubVec1) {
+ SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
+ SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ }
+ SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
+ SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
+ SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
+ SubVec1 =
+ DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
+ SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
+ SubRegs[PrevCnt + 1] = SubVec1;
+ });
+ assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
+ SDValue Vec = DAG.getUNDEF(ContainerVT);
+ for (auto [I, V] : enumerate(SubRegs)) {
+ if (!V)
continue;
- unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
- SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
- DAG.getVectorIdxConstant(ExtractIdx, DL));
- SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
- SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
- SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
- unsigned InsertIdx = DstVecIdx * NumOpElts;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+ unsigned InsertIdx = I * NumOpElts;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
DAG.getVectorIdxConstant(InsertIdx, DL));
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bbded57bb92ab0..8e2e3daf3d0d0d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -389,6 +389,105 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// give a more accurate cost than falling back to generic scalable codegen.
// TODO: Each of these cases hints at a modeling gap around scalable vectors.
if (isa<FixedVectorType>(Tp)) {
+ MVT LegalVT = LT.second;
+ InstructionCost NumOfDests = LT.first;
+ if (ST->hasVInstructions() &&
+ LT.second.getSizeInBits().getFixedValue() >
+ ST->getRealVLen().value_or(UINT_MAX) &&
+ !Mask.empty() && NumOfDests.isValid() && NumOfDests > 1 &&
+ LegalVT.isFixedLengthVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ Tp->getElementCount().getFixedValue()) {
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+
+ auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
+ LegalVT.getVectorNumElements());
+
+ // Try to perform better estimation of the permutation.
+ // 1. Split the source/destination vectors into real registers.
+ // 2. Do the mask analysis to identify which real registers are
+ // permuted. If more than 1 source registers are used for the
+ // destination register building, the cost for this destination register
+ // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
+ // source register is used, build mask and calculate the cost as a cost
+ // of PermuteSingleSrc.
+ // Also, for the single register permute we try to identify if the
+ // destination register is just a copy of the source register or the
+ // copy of the previous destination register (the cost is
+ // TTI::TCC_Basic). If the source register is just reused, the cost for
+ // this operation is 0.
+ NumOfDests = getTypeLegalizationCost(
+ FixedVectorType::get(Tp->getElementType(), Mask.size()))
+ .first;
+ unsigned E = *NumOfDests.getValue();
+ unsigned NormalizedVF =
+ LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
+ unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
+ copy(Mask, NormalizedMask.begin());
+ unsigned PrevSrcReg = 0;
+ ArrayRef<int> PrevRegMask;
+ InstructionCost Cost = 0;
+ SmallBitVector ExtractedRegs(2 * NumOfSrcRegs);
+ processShuffleMasks(
+ NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
+ [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
+ if (ExtractedRegs.test(SrcReg)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (SrcReg % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(SrcReg);
+ }
+ if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
+ // Check if the previous register can be just copied to the next
+ // one.
+ if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
+ PrevRegMask != RegMask) {
+ Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+ RegMask, CostKind, 0, nullptr);
+ } else {
+ // Just a copy of previous destination register.
+ Cost += TTI::TCC_Basic;
+ }
+ return;
+ }
+ if (SrcReg != DestReg &&
+ any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
+ // Just a copy of the source register.
+ Cost += TTI::TCC_Basic;
+ }
+ PrevSrcReg = SrcReg;
+ PrevRegMask = RegMask;
+ ExtractedRegs.set(DestReg);
+ },
+ [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2) {
+ if (ExtractedRegs.test(Idx1)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (Idx1 % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(Idx1);
+ }
+ if (ExtractedRegs.test(Idx2)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (Idx2 % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(Idx2);
+ }
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
+ CostKind, 0, nullptr);
+ });
+ return Cost;
+ }
switch (Kind) {
default:
break;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
index 5d629022c148fb..39c935fff6b76b 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
index 3d743c17715e20..2a89924dc77800 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
index 53262d8e4f564a..848e7b4e611a7e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
index 6913c753f36fa4..4c6d1ccd5ca342 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index d461fa8378cffc..afa0e7cc32ef4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -164,12 +164,11 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_rang...
[truncated]
|
@llvm/pr-subscribers-backend-risc-v Author: Alexey Bataev (alexey-bataev). Changes: Patch adds usage of processShuffleMasks in TTI for RISCV and in codegen. Patch is 20.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118103.diff — 8 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 989090b80e1c87..5f7aa530342489 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -504,25 +504,26 @@ void llvm::processShuffleMasks(
unsigned SzSrc = Sz / NumOfSrcRegs;
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
auto &RegMasks = Res[I];
- RegMasks.assign(NumOfSrcRegs, {});
+ RegMasks.assign(2 * NumOfSrcRegs, {});
// Check that the values in dest registers are in the one src
// register.
for (unsigned K = 0; K < SzDest; ++K) {
int Idx = I * SzDest + K;
if (Idx == Sz)
break;
- if (Mask[Idx] >= Sz || Mask[Idx] == PoisonMaskElem)
+ if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
continue;
- int SrcRegIdx = Mask[Idx] / SzSrc;
+ int MaskIdx = Mask[Idx] % Sz;
+ int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
// Add a cost of PermuteTwoSrc for each new source register permute,
// if we have more than one source registers.
if (RegMasks[SrcRegIdx].empty())
RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
- RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc;
+ RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
}
}
// Process split mask.
- for (unsigned I = 0; I < NumOfUsedRegs; ++I) {
+ for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
auto &Dest = Res[I];
int NumSrcRegs =
count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
@@ -567,7 +568,7 @@ void llvm::processShuffleMasks(
int FirstIdx = -1;
SecondIdx = -1;
MutableArrayRef<int> FirstMask, SecondMask;
- for (unsigned I = 0; I < NumOfDestRegs; ++I) {
+ for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
SmallVectorImpl<int> &RegMask = Dest[I];
if (RegMask.empty())
continue;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 78dc3cb27a6988..9947b733037657 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5081,7 +5081,6 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
SDValue V1 = SVN->getOperand(0);
SDValue V2 = SVN->getOperand(1);
ArrayRef<int> Mask = SVN->getMask();
- unsigned NumElts = VT.getVectorNumElements();
// If we don't know exact data layout, not much we can do. If this
// is already m1 or smaller, no point in splitting further.
@@ -5098,58 +5097,70 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
MVT ElemVT = VT.getVectorElementType();
unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
- unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
-
- SmallVector<std::pair<int, SmallVector<int>>>
- OutMasks(VRegsPerSrc, {-1, {}});
-
- // Check if our mask can be done as a 1-to-1 mapping from source
- // to destination registers in the group without needing to
- // write each destination more than once.
- for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
- int DstVecIdx = DstIdx / ElemsPerVReg;
- int DstSubIdx = DstIdx % ElemsPerVReg;
- int SrcIdx = Mask[DstIdx];
- if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
- continue;
- int SrcVecIdx = SrcIdx / ElemsPerVReg;
- int SrcSubIdx = SrcIdx % ElemsPerVReg;
- if (OutMasks[DstVecIdx].first == -1)
- OutMasks[DstVecIdx].first = SrcVecIdx;
- if (OutMasks[DstVecIdx].first != SrcVecIdx)
- // Note: This case could easily be handled by keeping track of a chain
- // of source values and generating two element shuffles below. This is
- // less an implementation question, and more a profitability one.
- return SDValue();
-
- OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
- OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
- }
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
assert(M1VT == getLMUL1VT(M1VT));
unsigned NumOpElts = M1VT.getVectorMinNumElements();
- SDValue Vec = DAG.getUNDEF(ContainerVT);
+ unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
+ unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
+ unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
// The following semantically builds up a fixed length concat_vector
// of the component shuffle_vectors. We eagerly lower to scalable here
// to avoid DAG combining it back to a large shuffle_vector again.
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
- for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
- auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
- if (SrcVecIdx == -1)
+ SmallVector<SDValue> SubRegs(NumOfDestRegs);
+ unsigned RegCnt = 0;
+ unsigned PrevCnt = 0;
+ processShuffleMasks(
+ Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
+ [&]() {
+ PrevCnt = RegCnt;
+ ++RegCnt;
+ },
+ [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx,
+ unsigned DstVecIdx) {
+ SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+ SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+ SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+ PrevCnt = RegCnt;
+ ++RegCnt;
+ },
+ [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2) {
+ if (PrevCnt + 1 == RegCnt)
+ ++RegCnt;
+ SDValue SubVec1 = SubRegs[PrevCnt + 1];
+ if (!SubVec1) {
+ SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
+ SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ }
+ SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
+ SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
+ unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
+ SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
+ SubVec1 =
+ DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
+ SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
+ SubRegs[PrevCnt + 1] = SubVec1;
+ });
+ assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
+ SDValue Vec = DAG.getUNDEF(ContainerVT);
+ for (auto [I, V] : enumerate(SubRegs)) {
+ if (!V)
continue;
- unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
- SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
- DAG.getVectorIdxConstant(ExtractIdx, DL));
- SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
- SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
- SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
- unsigned InsertIdx = DstVecIdx * NumOpElts;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+ unsigned InsertIdx = I * NumOpElts;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
DAG.getVectorIdxConstant(InsertIdx, DL));
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bbded57bb92ab0..8e2e3daf3d0d0d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -389,6 +389,105 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// give a more accurate cost than falling back to generic scalable codegen.
// TODO: Each of these cases hints at a modeling gap around scalable vectors.
if (isa<FixedVectorType>(Tp)) {
+ MVT LegalVT = LT.second;
+ InstructionCost NumOfDests = LT.first;
+ if (ST->hasVInstructions() &&
+ LT.second.getSizeInBits().getFixedValue() >
+ ST->getRealVLen().value_or(UINT_MAX) &&
+ !Mask.empty() && NumOfDests.isValid() && NumOfDests > 1 &&
+ LegalVT.isFixedLengthVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ Tp->getElementCount().getFixedValue()) {
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+
+ auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
+ LegalVT.getVectorNumElements());
+
+ // Try to perform better estimation of the permutation.
+ // 1. Split the source/destination vectors into real registers.
+ // 2. Do the mask analysis to identify which real registers are
+ // permuted. If more than 1 source registers are used for the
+ // destination register building, the cost for this destination register
+ // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
+ // source register is used, build mask and calculate the cost as a cost
+ // of PermuteSingleSrc.
+ // Also, for the single register permute we try to identify if the
+ // destination register is just a copy of the source register or the
+ // copy of the previous destination register (the cost is
+ // TTI::TCC_Basic). If the source register is just reused, the cost for
+ // this operation is 0.
+ NumOfDests = getTypeLegalizationCost(
+ FixedVectorType::get(Tp->getElementType(), Mask.size()))
+ .first;
+ unsigned E = *NumOfDests.getValue();
+ unsigned NormalizedVF =
+ LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
+ unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
+ copy(Mask, NormalizedMask.begin());
+ unsigned PrevSrcReg = 0;
+ ArrayRef<int> PrevRegMask;
+ InstructionCost Cost = 0;
+ SmallBitVector ExtractedRegs(2 * NumOfSrcRegs);
+ processShuffleMasks(
+ NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
+ [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
+ if (ExtractedRegs.test(SrcReg)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (SrcReg % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(SrcReg);
+ }
+ if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
+ // Check if the previous register can be just copied to the next
+ // one.
+ if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
+ PrevRegMask != RegMask) {
+ Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+ RegMask, CostKind, 0, nullptr);
+ } else {
+ // Just a copy of previous destination register.
+ Cost += TTI::TCC_Basic;
+ }
+ return;
+ }
+ if (SrcReg != DestReg &&
+ any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
+ // Just a copy of the source register.
+ Cost += TTI::TCC_Basic;
+ }
+ PrevSrcReg = SrcReg;
+ PrevRegMask = RegMask;
+ ExtractedRegs.set(DestReg);
+ },
+ [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2) {
+ if (ExtractedRegs.test(Idx1)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (Idx1 % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(Idx1);
+ }
+ if (ExtractedRegs.test(Idx2)) {
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
+ (Idx2 % NumOfSrcRegs) *
+ SingleOpTy->getNumElements(),
+ SingleOpTy);
+ ExtractedRegs.set(Idx2);
+ }
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
+ CostKind, 0, nullptr);
+ });
+ return Cost;
+ }
switch (Kind) {
default:
break;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
index 5d629022c148fb..39c935fff6b76b 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
index 3d743c17715e20..2a89924dc77800 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
index 53262d8e4f564a..848e7b4e611a7e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
index 6913c753f36fa4..4c6d1ccd5ca342 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index d461fa8378cffc..afa0e7cc32ef4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -164,12 +164,11 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_rang...
[truncated]
|
|
||
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); | ||
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); | ||
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); | ||
assert(M1VT == getLMUL1VT(M1VT)); | ||
unsigned NumOpElts = M1VT.getVectorMinNumElements(); | ||
SDValue Vec = DAG.getUNDEF(ContainerVT); | ||
unsigned NormalizedVF = ContainerVT.getVectorMinNumElements(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think VF is a term used outside of the vectorizers much. Maybe use something like ContainerNumOpElts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The llvm::processShuffleMasks + x86 changes LGTM - but I defer to the RISCV team for the rest of it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are at least three changes here which need to be separated.
- Change to generic processShuffleMasks, and tests for existing targets. Is this a generally beneficial change? That needs to be justified, and reviewed. (I had somehow missed in the github interface that Simon had already LGTMed this bit. Please separate and land.)
- Change to RISCV lowering to use processShuffleMasks. In particular, this needs significantly more testing!
- Change to RISCV cost model.
I have no opinion on the order of 1 and 2, but 3 definitely needs to follow both 1 and 2.
@alexey-bataev If you can pull out the x86 / processShuffleMasks changes - I can complete some wip x86 changes I have to use it for SK_PermuteTwoSrc |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Ping! |
1 similar comment
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM w/ comments addressed
…ctual per-register shuffles Patch adds usage of processShuffleMasks in TTI for RISCV. This function is already used for X86 shuffle cost estimation, in DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE, and in RISCV codegen. The patch allows better cost estimation for sparse masks and unifies cost/codegen between different targets/passes. Reviewers: preames Reviewed By: preames Pull Request: llvm/llvm-project#118103
Patch adds usage of processShuffleMasks in TTI for RISCV. This function is already used for X86
shuffle cost estimation, in DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE,
and in RISCV codegen.
The patch allows better cost estimation for sparse masks and unifies
cost/codegen between different targets/passes.