
Commit a795a18

[SLP][REVEC] VF should be scaled when ScalarTy is FixedVectorType. (#114551)
1 parent: 5445edb

2 files changed (+61, -5 lines)

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 5 deletions
@@ -5086,6 +5086,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         VecLdCost +=
             TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
       }
+      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
       auto *SubVecTy = getWidenedType(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5109,11 +5110,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                 SubVecTy, APInt::getAllOnes(VF),
                 /*Insert=*/true, /*Extract=*/false, CostKind);
           else
-            VectorGEPCost += TTI.getScalarizationOverhead(
-                                 SubVecTy, APInt::getOneBitSet(VF, 0),
-                                 /*Insert=*/true, /*Extract=*/false, CostKind) +
-                             ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy,
-                                              {}, CostKind);
+            VectorGEPCost +=
+                TTI.getScalarizationOverhead(
+                    SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+                    /*Insert=*/true, /*Extract=*/false, CostKind) +
+                ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
+                                 CostKind);
       }
       switch (LS) {
       case LoadsState::Vectorize:
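Why the mask width changes: under REVEC, ScalarTy can itself be a FixedVectorType, so the widened subvector returned by getWidenedType(ScalarTy, VF) holds getNumElements(ScalarTy) * VF scalar lanes rather than VF. The demanded-elements mask handed to getScalarizationOverhead has to match that lane count, which is what the scaled APInt above provides. A minimal standalone sketch of the relationship, using hypothetical element counts rather than values from a real cost query (builds against llvm/ADT/APInt.h, e.g. with `clang++ $(llvm-config --cxxflags)`):

// Sketch only: hypothetical ScalarTy/VF values, not the actual SLP cost path.
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  unsigned ScalarTyNumElements = 8; // e.g. ScalarTy == <8 x float> under REVEC
  unsigned VF = 4;                  // the subvector groups 4 such ScalarTy values
  // SubVecTy = getWidenedType(ScalarTy, VF) has 8 * 4 = 32 scalar lanes, so the
  // demanded-lane mask for getScalarizationOverhead must be 32 bits wide.
  llvm::APInt DemandedOld = llvm::APInt::getOneBitSet(VF, 0);
  llvm::APInt DemandedNew =
      llvm::APInt::getOneBitSet(ScalarTyNumElements * VF, 0);
  assert(DemandedOld.getBitWidth() == 4);  // mismatched with SubVecTy before
  assert(DemandedNew.getBitWidth() == 32); // matches SubVecTy after the fix
  return 0;
}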

llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll

Lines changed: 54 additions & 0 deletions
@@ -40,3 +40,57 @@ sw.bb509.i: ; preds = %if.then458.i, %if.e
   %5 = phi <2 x i32> [ %1, %if.then458.i ], [ zeroinitializer, %if.end.i87 ], [ zeroinitializer, %if.end.i87 ]
   ret i32 0
 }
+
+define void @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr null, i64 132
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr null, i64 200
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr null, i64 300
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16)
+; CHECK-NEXT:    [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16)
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24)
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 x float>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP17:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16)
+; CHECK-NEXT:    [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24)
+; CHECK-NEXT:    [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = getelementptr i8, ptr null, i64 132
+  %1 = getelementptr i8, ptr null, i64 164
+  %2 = getelementptr i8, ptr null, i64 200
+  %3 = getelementptr i8, ptr null, i64 300
+  %4 = load <8 x float>, ptr %0, align 4
+  %5 = load <8 x float>, ptr %1, align 4
+  %6 = load <8 x float>, ptr %2, align 4
+  %7 = load <8 x float>, ptr %3, align 4
+  %8 = fpext <8 x float> %4 to <8 x double>
+  %9 = fpext <8 x float> %5 to <8 x double>
+  %10 = fpext <8 x float> %6 to <8 x double>
+  %11 = fpext <8 x float> %7 to <8 x double>
+  %12 = fadd <8 x double> zeroinitializer, %8
+  %13 = fadd <8 x double> zeroinitializer, %9
+  %14 = fadd <8 x double> zeroinitializer, %10
+  %15 = fadd <8 x double> zeroinitializer, %11
+  %16 = fptrunc <8 x double> %12 to <8 x float>
+  %17 = fptrunc <8 x double> %13 to <8 x float>
+  %18 = fptrunc <8 x double> %14 to <8 x float>
+  %19 = fptrunc <8 x double> %15 to <8 x float>
+  %20 = fcmp ogt <8 x float> zeroinitializer, %16
+  %21 = fcmp ogt <8 x float> zeroinitializer, %17
+  %22 = fcmp ogt <8 x float> zeroinitializer, %18
+  %23 = fcmp ogt <8 x float> zeroinitializer, %19
+  ret void
+}
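The new test exercises the REVEC case where ScalarTy is <8 x float>: the CHECK lines expect the four <8 x float> load/fpext/fadd/fptrunc/fcmp chains to be regrouped into single <32 x float> and <32 x double> operations assembled with llvm.vector.insert, which is only costed correctly once the demanded-lane mask is scaled by the number of ScalarTy elements.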
