[SLP]Initial non-power-of-2 support (but still whole register) for reductions #112361
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers Author: Alexey Bataev (alexey-bataev)
Changes: Enables initial non-power-of-2 support (but still requires the number of elements to form whole registers) for reductions.
Full diff: https://github.com/llvm/llvm-project/pull/112361.diff
2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 595aed2cab182b..0d4a0b3745ddff 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -291,6 +291,8 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
if (NumParts == 0 || NumParts >= Sz)
return bit_floor(Sz);
unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
+ if (RegVF > Sz)
+ return bit_floor(Sz);
return (Sz / RegVF) * RegVF;
}
@@ -19061,7 +19063,8 @@ class HorizontalReduction {
unsigned ReduxWidth = NumReducedVals;
if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
- ReduxWidth = bit_floor(ReduxWidth);
+ ReduxWidth = getFloorFullVectorNumberOfElements(
+ *TTI, Candidates.front()->getType(), ReduxWidth);
ReduxWidth = std::min(ReduxWidth, MaxElts);
unsigned Start = 0;
@@ -19069,10 +19072,7 @@ class HorizontalReduction {
// Restarts vectorization attempt with lower vector factor.
unsigned PrevReduxWidth = ReduxWidth;
bool CheckForReusedReductionOpsLocal = false;
- auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
- &CheckForReusedReductionOpsLocal,
- &PrevReduxWidth, &V,
- &IgnoreList](bool IgnoreVL = false) {
+ auto AdjustReducedVals = [&](bool IgnoreVL = false) {
bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
// Check if any of the reduction ops are gathered. If so, worth
@@ -19083,7 +19083,10 @@ class HorizontalReduction {
if (Pos < NumReducedVals - ReduxWidth + 1)
return IsAnyRedOpGathered;
Pos = Start;
- ReduxWidth = bit_ceil(ReduxWidth) / 2;
+ --ReduxWidth;
+ if (ReduxWidth > 1)
+ ReduxWidth = getFloorFullVectorNumberOfElements(
+ *TTI, Candidates.front()->getType(), ReduxWidth);
return IsAnyRedOpGathered;
};
bool AnyVectorized = false;
@@ -19315,7 +19318,10 @@ class HorizontalReduction {
}
Pos += ReduxWidth;
Start = Pos;
- ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+ ReduxWidth = NumReducedVals - Pos;
+ if (ReduxWidth > 1)
+ ReduxWidth = getFloorFullVectorNumberOfElements(
+ *TTI, Candidates.front()->getType(), NumReducedVals - Pos);
AnyVectorized = true;
}
if (OptReusedScalars && !AnyVectorized) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 72e29839230e81..c9ff2d6426d2b6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -318,22 +318,14 @@ entry:
define float @f(ptr nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
-; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; CHECK-NEXT: ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT: entry:
-; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
-; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
-; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; THRESHOLD-NEXT: ret float [[OP_RDX]]
;
entry:
@@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
; CHECK-LABEL: @loadadd31(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
; THRESHOLD-LABEL: @loadadd31(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
-; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
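The core of the change is getFloorFullVectorNumberOfElements: instead of flooring the reduction width to a power of two, the width is floored to the largest element count that still fills whole vector registers. Below is a minimal standalone C++ sketch of that rule, not the in-tree code: the hypothetical floorToWholeRegisters takes the register split (NumParts) as a parameter, whereas the real helper derives it from TargetTransformInfo and the element type.

#include <bit>
#include <cstdio>

// Largest element count <= Sz that fills complete registers; falls back to a
// power of two in the degenerate cases, mirroring the shape of the patched
// helper shown in the diff above.
unsigned floorToWholeRegisters(unsigned Sz, unsigned NumParts) {
  if (NumParts == 0 || NumParts >= Sz)
    return std::bit_floor(Sz);
  // Elements per register, rounded up to a power of two.
  unsigned RegVF = std::bit_ceil((Sz + NumParts - 1) / NumParts);
  if (RegVF > Sz)
    return std::bit_floor(Sz); // a single register already covers Sz
  return (Sz / RegVF) * RegVF; // largest multiple of RegVF not exceeding Sz
}

int main() {
  // 48 floats split over three 16-wide registers stay at 48
  // (the <48 x float> reduction in the @f test above).
  std::printf("%u\n", floorToWholeRegisters(48, 3));
  // 31 floats with 8-wide registers floor to 24
  // (the <24 x float> reduction in the @loadadd31 test above).
  std::printf("%u\n", floorToWholeRegisters(31, 4));
}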
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev)
(Same change summary and full diff as above.)
LGTM
It looks like this causes some compile-time regressions: https://llvm-compile-time-tracker.com/compare.php?from=ec78f0da0e9b1b8e2b2323e434ea742e272dd913&to=8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b&stat=instructions:u For example, lencod's me_distortion.c regresses by 14% without code changes.
Will double check
…ductions Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#112361
…ductions Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #112361
Hi - Sorry but I had to revert this again. It caused regressions in the Phase Ordering test due to the order of shuffle elements being a lot worse than they were. ([0,2,4,6] is much better than [0,3,4,7]). It is a fairly gnarly SLP vectorization issue, where the order of elements matters a lot for performance and is tricky to get right. That was fixed in #100653, but the second patch here (7f2e937) undid it again I think. That might be something that is necessary if we can't figure out how to do it nicely within the compile-time budget that we have, but it should happen properly in a reviewed patch, not in a fixup to something unrelated. The test case is quite subtle, and the whole thing feels a bit delicate at the moment. There are some adjustments made in VectorCombiner that help code like this that uses select-shuffles, and that was at least partially done there so that it didn't have to be run on every SLP case. I know there are other cases where we could be doing better if we could get the ordering a bit nicer, but it's tough to do that quickly for every bit of potentially SLP-vectorizable code.
…ductions Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #112361
@alexey-bataev we (at google) bisected a compilation breakage (an infinite loop in the compiler) to this change. Here's a reproducer:
float *a;
float b, c, d, e, f, g;
void h() {
g += 0.0f * a[21];
g += 0.0f * a[7];
f += 0.0f * a[2];
g += 0.0f * a[17];
f += 0.0f * a[22];
f += c * a[23];
g += d * b;
f += e * a[24];
f += d * a[20];
g += e * a[25];
g += f;
} Compilation command: $ clang -target riscv64-unknown-linux \
-march=rv64gcv1p0_zfh_zvfh \
-std=c99 \
-O2 \
-c repro.i \
-o /tmp/out
The compilation takes under 0.1s with the previous revision. Could you please revert to unblock us?
For convenience: https://gcc.godbolt.org/z/nYds3sWvW
Need to check if the interleaving factor is less than total number of elements in loads slice to handle it correctly and avoid compiler crash. Fixes report #112361 (comment)
Must be fixed in 0c18def
Thank you @alexey-bataev for the prompt fix!!
Need to check if the interleaving factor is less than total number of elements in loads slice to handle it correctly and avoid compiler crash. Fixes report llvm#112361 (comment)
@alexey-bataev, is there a chance you can provide a switch to turn this feature on/off? Thanks for considering!
Why?
We've run into a performance regression and would like to have a choice to disable the feature to see how it impacts other workloads. I'm gathering more information on the regression and can probably provide a test case in the near future.
Enables initial non-power-of-2 support (but still requires number of
elements, forming whole registers) for reductions.
Enables extra vectorization for
MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and
CFP2017rate/526.blender_r (checked for SSE2)
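A second behavioural change worth noting: when a vectorization attempt fails, the reduction width no longer halves (bit_ceil(ReduxWidth) / 2); it drops by one and is re-floored to a whole number of registers, as in the AdjustReducedVals hunk above. A hedged sketch of that stepping, reusing the hypothetical floorToWholeRegisters helper from the earlier example (the real code recomputes the register split from TTI for each new width rather than taking NumParts as a parameter):

// Next width to try after a failed attempt: decrement, then re-floor to
// whole registers so the retry still forms full vectors.
unsigned nextReduxWidth(unsigned Width, unsigned NumParts) {
  --Width;
  if (Width > 1)
    Width = floorToWholeRegisters(Width, NumParts);
  return Width;
}
// E.g. with 8-wide registers a failed 48-element attempt retries at 40
// (47 floored to whole registers) instead of jumping straight down to 32.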