Skip to content

[SLP][REVEC] Fix cost model for getGatherCost with FixedVectorType ScalarTy. #109369

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11732,8 +11732,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
APInt ShuffledElements = APInt::getZero(VecTy->getNumElements());
APInt ShuffledElements = APInt::getZero(VL.size());
DenseMap<Value *, unsigned> UniqueElements;
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost;
Expand All @@ -11753,8 +11752,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Value *V = VL[I];
// No need to shuffle duplicates for constants.
if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
ShuffledElements.setBits(I * ScalarTyNumElements,
I * ScalarTyNumElements + ScalarTyNumElements);
ShuffledElements.setBit(I);
ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
continue;
}
Expand All @@ -11767,14 +11765,27 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
}

DuplicateNonConst = true;
ShuffledElements.setBits(I * ScalarTyNumElements,
I * ScalarTyNumElements + ScalarTyNumElements);
ShuffledElements.setBit(I);
ShuffleMask[I] = Res.first->second;
}
if (ForPoisonSrc)
Cost =
TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
/*Extract*/ false, CostKind);
if (ForPoisonSrc) {
if (isa<FixedVectorType>(ScalarTy)) {
assert(SLPReVec && "Only supported by REVEC.");
// We don't need to insert elements one by one. Instead, we can insert the
// entire vector into the destination.
Cost = 0;
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
for (unsigned I : seq<unsigned>(VL.size()))
if (!ShuffledElements[I])
Cost += TTI->getShuffleCost(
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
}
if (DuplicateNonConst)
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
VecTy, ShuffleMask);
Expand Down
42 changes: 42 additions & 0 deletions llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 -pass-remarks-output=%t %s | FileCheck %s
; RUN: FileCheck --input-file=%t --check-prefix=YAML %s

; YAML: --- !Passed
; YAML: Pass: slp-vectorizer
; YAML: Name: StoresVectorized
; YAML: Function: test
; YAML: Args:
; YAML: - String: 'Stores SLP vectorized with cost '
; YAML: - Cost: '6'
; YAML: - String: ' and with tree size '
; YAML: - TreeSize: '5'

define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[VEXT165_I]], i64 0)
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP0]], <4 x float> [[VEXT309_I]], i64 4)
; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> poison, i64 4)
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[LOAD17:%.*]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[FMULADD7:%.*]], i64 0)
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP5]], <4 x float> [[FMULADD16:%.*]], i64 4)
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%vext165.i = shufflevector <4 x float> %load6, <4 x float> %load7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%vext309.i = shufflevector <4 x float> %load7, <4 x float> %load8, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%fmuladd8 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext165.i, <4 x float> %load17, <4 x float> %fmuladd7)
%fmuladd17 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext309.i, <4 x float> %load17, <4 x float> %fmuladd16)
%add.ptr.i.i = getelementptr inbounds i8, ptr %out_ptr, i64 16
store <4 x float> %fmuladd8, ptr %out_ptr, align 4
store <4 x float> %fmuladd17, ptr %add.ptr.i.i, align 4
ret void
}

declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
Loading