Skip to content

Commit aeedab7

Browse files
committed
[SLP]Correctly decide if the non-power-of-2 number of stores can be vectorized.
Need to consider the maximum type size in the graph before attempting to vectorize a non-power-of-2 number of elements, which may be less than MinVF.
1 parent d5c292d commit aeedab7

File tree

2 files changed

+28
-39
lines changed

2 files changed

+28
-39
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16476,8 +16476,9 @@ bool SLPVectorizerPass::vectorizeStores(
1647616476
// First try vectorizing with a non-power-of-2 VF. At the moment, only
1647716477
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
1647816478
// lanes are used.
16479-
unsigned CandVF = Operands.size();
16480-
if (has_single_bit(CandVF + 1) && CandVF <= MaxRegVF)
16479+
unsigned CandVF =
16480+
std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
16481+
if (has_single_bit(CandVF + 1))
1648116482
NonPowerOf2VF = CandVF;
1648216483
}
1648316484

llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll

Lines changed: 25 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,43 +9,31 @@
99
;}
1010

1111
define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) {
12-
; NON-POW2-LABEL: @foo(
13-
; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
14-
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4
15-
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0
16-
; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer
17-
; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]]
18-
; NON-POW2-NEXT: [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double>
19-
; NON-POW2-NEXT: [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
20-
; NON-POW2-NEXT: [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8>
21-
; NON-POW2-NEXT: store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1
22-
; NON-POW2-NEXT: ret i32 undef
23-
;
24-
; POW2-ONLY-LABEL: @foo(
25-
; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
26-
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
27-
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
28-
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
29-
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
30-
; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
31-
; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
32-
; POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
33-
; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
34-
; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
35-
; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
36-
; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
37-
; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
38-
; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
39-
; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
40-
; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
41-
; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
42-
; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
43-
; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
44-
; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
45-
; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
46-
; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
47-
; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
48-
; POW2-ONLY-NEXT: ret i32 undef
12+
; CHECK-LABEL: @foo(
13+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
14+
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
15+
; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
16+
; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
17+
; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
18+
; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
19+
; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
20+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
21+
; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
22+
; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
23+
; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
24+
; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
25+
; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
26+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
27+
; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
28+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
29+
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
30+
; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
31+
; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
32+
; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
33+
; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
34+
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
35+
; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
36+
; CHECK-NEXT: ret i32 undef
4937
;
5038
%1 = getelementptr inbounds float, ptr %B, i64 10
5139
%2 = load float, ptr %1, align 4

0 commit comments

Comments
 (0)