Skip to content

Commit 685bec7

Browse files
committed
Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions"
This reverts commit 8287fa8 in order to investigate and fix the compile-time regressions reported at https://llvm-compile-time-tracker.com/compare.php?from=ec78f0da0e9b1b8e2b2323e434ea742e272dd913&to=8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b&stat=instructions:u
1 parent d1a4791 commit 685bec7

File tree

2 files changed: +31 / -21 lines changed

2 files changed

+31
-21
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
291291
if (NumParts == 0 || NumParts >= Sz)
292292
return bit_floor(Sz);
293293
unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294-
if (RegVF > Sz)
295-
return bit_floor(Sz);
296294
return (Sz / RegVF) * RegVF;
297295
}
298296

@@ -19073,16 +19071,18 @@ class HorizontalReduction {
1907319071

1907419072
unsigned ReduxWidth = NumReducedVals;
1907519073
if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
19076-
ReduxWidth = getFloorFullVectorNumberOfElements(
19077-
*TTI, Candidates.front()->getType(), ReduxWidth);
19074+
ReduxWidth = bit_floor(ReduxWidth);
1907819075
ReduxWidth = std::min(ReduxWidth, MaxElts);
1907919076

1908019077
unsigned Start = 0;
1908119078
unsigned Pos = Start;
1908219079
// Restarts vectorization attempt with lower vector factor.
1908319080
unsigned PrevReduxWidth = ReduxWidth;
1908419081
bool CheckForReusedReductionOpsLocal = false;
19085-
auto AdjustReducedVals = [&](bool IgnoreVL = false) {
19082+
auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
19083+
&CheckForReusedReductionOpsLocal,
19084+
&PrevReduxWidth, &V,
19085+
&IgnoreList](bool IgnoreVL = false) {
1908619086
bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
1908719087
if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
1908819088
// Check if any of the reduction ops are gathered. If so, worth
@@ -19093,10 +19093,7 @@ class HorizontalReduction {
1909319093
if (Pos < NumReducedVals - ReduxWidth + 1)
1909419094
return IsAnyRedOpGathered;
1909519095
Pos = Start;
19096-
--ReduxWidth;
19097-
if (ReduxWidth > 1)
19098-
ReduxWidth = getFloorFullVectorNumberOfElements(
19099-
*TTI, Candidates.front()->getType(), ReduxWidth);
19096+
ReduxWidth = bit_ceil(ReduxWidth) / 2;
1910019097
return IsAnyRedOpGathered;
1910119098
};
1910219099
bool AnyVectorized = false;
@@ -19328,10 +19325,7 @@ class HorizontalReduction {
1932819325
}
1932919326
Pos += ReduxWidth;
1933019327
Start = Pos;
19331-
ReduxWidth = NumReducedVals - Pos;
19332-
if (ReduxWidth > 1)
19333-
ReduxWidth = getFloorFullVectorNumberOfElements(
19334-
*TTI, Candidates.front()->getType(), NumReducedVals - Pos);
19328+
ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
1933519329
AnyVectorized = true;
1933619330
}
1933719331
if (OptReusedScalars && !AnyVectorized) {

llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,22 @@ entry:
318318
define float @f(ptr nocapture readonly %x) {
319319
; CHECK-LABEL: @f(
320320
; CHECK-NEXT: entry:
321-
; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
322-
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
321+
; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
322+
; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
323+
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
324+
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
325+
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
326+
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
323327
; CHECK-NEXT: ret float [[OP_RDX]]
324328
;
325329
; THRESHOLD-LABEL: @f(
326330
; THRESHOLD-NEXT: entry:
327-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
328-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
331+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
332+
; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
333+
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
334+
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
335+
; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
336+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
329337
; THRESHOLD-NEXT: ret float [[OP_RDX]]
330338
;
331339
entry:
@@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
598606
; CHECK-LABEL: @loadadd31(
599607
; CHECK-NEXT: entry:
600608
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
601-
; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
609+
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
610+
; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
611+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
602612
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
603613
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
604614
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
605615
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
606616
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
607617
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
608-
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
618+
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
619+
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
620+
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
609621
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
610622
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
611623
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
615627
; THRESHOLD-LABEL: @loadadd31(
616628
; THRESHOLD-NEXT: entry:
617629
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
618-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
630+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
631+
; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
632+
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
619633
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
620634
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
621635
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
622636
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
623637
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
624638
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
625-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
639+
; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
640+
; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
641+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
626642
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
627643
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
628644
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]

0 commit comments

Comments
 (0)