Skip to content

Commit 5915170

Browse files
committed
[SLP][REVEC] Fix cost model for getGatherCost with FixedVectorType ScalarTy.
1 parent 95191e0 commit 5915170

File tree

2 files changed

+30
-15
lines changed

2 files changed

+30
-15
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11732,8 +11732,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1173211732
// Find the cost of inserting/extracting values from the vector.
1173311733
// Check if the same elements are inserted several times and count them as
1173411734
// shuffle candidates.
11735-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
11736-
APInt ShuffledElements = APInt::getZero(VecTy->getNumElements());
11735+
APInt ShuffledElements = APInt::getZero(VL.size());
1173711736
DenseMap<Value *, unsigned> UniqueElements;
1173811737
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1173911738
InstructionCost Cost;
@@ -11753,8 +11752,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1175311752
Value *V = VL[I];
1175411753
// No need to shuffle duplicates for constants.
1175511754
if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11756-
ShuffledElements.setBits(I * ScalarTyNumElements,
11757-
I * ScalarTyNumElements + ScalarTyNumElements);
11755+
ShuffledElements.setBit(I);
1175811756
ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
1175911757
continue;
1176011758
}
@@ -11767,14 +11765,27 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1176711765
}
1176811766

1176911767
DuplicateNonConst = true;
11770-
ShuffledElements.setBits(I * ScalarTyNumElements,
11771-
I * ScalarTyNumElements + ScalarTyNumElements);
11768+
ShuffledElements.setBit(I);
1177211769
ShuffleMask[I] = Res.first->second;
1177311770
}
11774-
if (ForPoisonSrc)
11775-
Cost =
11776-
TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11777-
/*Extract*/ false, CostKind);
11771+
if (ForPoisonSrc) {
11772+
if (isa<FixedVectorType>(ScalarTy)) {
11773+
assert(SLPReVec && "Only supported by REVEC.");
11774+
// We don't need to insert elements one by one. Instead, we can insert the
11775+
// entire vector into the destination.
11776+
Cost = 0;
11777+
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
11778+
for (unsigned I = 0, E = VL.size(); I != E; ++I)
11779+
if (!ShuffledElements[I])
11780+
Cost += TTI->getShuffleCost(
11781+
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
11782+
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
11783+
} else {
11784+
Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
11785+
/*Insert*/ true,
11786+
/*Extract*/ false, CostKind);
11787+
}
11788+
}
1177811789
if (DuplicateNonConst)
1177911790
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
1178011791
VecTy, ShuffleMask);

llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@ define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
88
; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9-
; CHECK-NEXT: [[FMULADD8:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT165_I]], <4 x float> [[LOAD17:%.*]], <4 x float> [[FMULADD7:%.*]])
10-
; CHECK-NEXT: [[FMULADD17:%.*]] = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[VEXT309_I]], <4 x float> [[LOAD17]], <4 x float> [[FMULADD16:%.*]])
11-
; CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR:%.*]], i64 16
12-
; CHECK-NEXT: store <4 x float> [[FMULADD8]], ptr [[OUT_PTR]], align 4
13-
; CHECK-NEXT: store <4 x float> [[FMULADD17]], ptr [[ADD_PTR_I_I]], align 4
9+
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[VEXT165_I]], i64 0)
10+
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP0]], <4 x float> [[VEXT309_I]], i64 4)
11+
; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> poison, i64 4)
12+
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[LOAD17:%.*]], i64 0)
13+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
14+
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[FMULADD7:%.*]], i64 0)
15+
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP5]], <4 x float> [[FMULADD16:%.*]], i64 4)
16+
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
17+
; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4
1418
; CHECK-NEXT: ret void
1519
;
1620
entry:

0 commit comments

Comments
 (0)