Skip to content

Commit e7eaf3a

Browse files
alexey-bataev authored and DanielCChen committed
[SLP]Initial non-power-of-2 support (but still whole register) for reductions
Enables initial non-power-of-2 support for reductions (the number of elements must still form whole registers). Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2). Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: llvm#112361
1 parent 72c22dc commit e7eaf3a

File tree

2 files changed

+21
-31
lines changed

2 files changed

+21
-31
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,8 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
291291
if (NumParts == 0 || NumParts >= Sz)
292292
return bit_floor(Sz);
293293
unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294+
if (RegVF > Sz)
295+
return bit_floor(Sz);
294296
return (Sz / RegVF) * RegVF;
295297
}
296298

@@ -19061,18 +19063,16 @@ class HorizontalReduction {
1906119063

1906219064
unsigned ReduxWidth = NumReducedVals;
1906319065
if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
19064-
ReduxWidth = bit_floor(ReduxWidth);
19066+
ReduxWidth = getFloorFullVectorNumberOfElements(
19067+
*TTI, Candidates.front()->getType(), ReduxWidth);
1906519068
ReduxWidth = std::min(ReduxWidth, MaxElts);
1906619069

1906719070
unsigned Start = 0;
1906819071
unsigned Pos = Start;
1906919072
// Restarts vectorization attempt with lower vector factor.
1907019073
unsigned PrevReduxWidth = ReduxWidth;
1907119074
bool CheckForReusedReductionOpsLocal = false;
19072-
auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
19073-
&CheckForReusedReductionOpsLocal,
19074-
&PrevReduxWidth, &V,
19075-
&IgnoreList](bool IgnoreVL = false) {
19075+
auto AdjustReducedVals = [&](bool IgnoreVL = false) {
1907619076
bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
1907719077
if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
1907819078
// Check if any of the reduction ops are gathered. If so, worth
@@ -19083,7 +19083,10 @@ class HorizontalReduction {
1908319083
if (Pos < NumReducedVals - ReduxWidth + 1)
1908419084
return IsAnyRedOpGathered;
1908519085
Pos = Start;
19086-
ReduxWidth = bit_ceil(ReduxWidth) / 2;
19086+
--ReduxWidth;
19087+
if (ReduxWidth > 1)
19088+
ReduxWidth = getFloorFullVectorNumberOfElements(
19089+
*TTI, Candidates.front()->getType(), ReduxWidth);
1908719090
return IsAnyRedOpGathered;
1908819091
};
1908919092
bool AnyVectorized = false;
@@ -19315,7 +19318,10 @@ class HorizontalReduction {
1931519318
}
1931619319
Pos += ReduxWidth;
1931719320
Start = Pos;
19318-
ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
19321+
ReduxWidth = NumReducedVals - Pos;
19322+
if (ReduxWidth > 1)
19323+
ReduxWidth = getFloorFullVectorNumberOfElements(
19324+
*TTI, Candidates.front()->getType(), NumReducedVals - Pos);
1931919325
AnyVectorized = true;
1932019326
}
1932119327
if (OptReusedScalars && !AnyVectorized) {

llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -318,22 +318,14 @@ entry:
318318
define float @f(ptr nocapture readonly %x) {
319319
; CHECK-LABEL: @f(
320320
; CHECK-NEXT: entry:
321-
; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
322-
; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
323-
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
324-
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
325-
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
326-
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
321+
; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
322+
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
327323
; CHECK-NEXT: ret float [[OP_RDX]]
328324
;
329325
; THRESHOLD-LABEL: @f(
330326
; THRESHOLD-NEXT: entry:
331-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
332-
; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
333-
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
334-
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
335-
; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
336-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
327+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
328+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
337329
; THRESHOLD-NEXT: ret float [[OP_RDX]]
338330
;
339331
entry:
@@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
606598
; CHECK-LABEL: @loadadd31(
607599
; CHECK-NEXT: entry:
608600
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
609-
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
610-
; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
611-
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
601+
; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
612602
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
613603
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
614604
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
615605
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
616606
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
617607
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
618-
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
619-
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
620-
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
608+
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
621609
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
622610
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
623611
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
627615
; THRESHOLD-LABEL: @loadadd31(
628616
; THRESHOLD-NEXT: entry:
629617
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
630-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
631-
; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
632-
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
618+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
633619
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
634620
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
635621
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
636622
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
637623
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
638624
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
639-
; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
640-
; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
641-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
625+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
642626
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
643627
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
644628
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]

0 commit comments

Comments
 (0)