[SLP]Enable splat ordering for loads #115173
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Enables splat support for loads when the number of lanes > 2 or the number of operands > 2. Allows better detection of splats of loads and reduces the number of shuffles in some cases.

X86, AVX512, -O3+LTO
Metric: size..text

Program                                                                    results      results0     diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test                   154867.00    156723.00   1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test   12467735.00  12468023.00   0.0%

Better vectorization quality

Full diff: https://github.com/llvm/llvm-project/pull/115173.diff

2 Files Affected:

- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..bf7bc570000dde 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2386,6 +2386,9 @@ class BoUpSLP {
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ // Small number of loads - try load matching.
+ if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
+ return false;
bool OpAPO = getData(OpIdx, Lane).APO;
bool IsInvariant = L && L->isLoopInvariant(Op);
unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
- if (isa<LoadInst>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
+ if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
!canBeVectorized(OpILane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else if (isa<LoadInst>(OpILane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
- } else if (isa<Constant>(OpLane0))
+ } else if (isa<Constant>(OpLane0)) {
ReorderingModes[OpIdx] = ReorderingMode::Constant;
- else if (isa<Argument>(OpLane0))
+ } else if (isa<Argument>(OpLane0)) {
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- // NOTE: This should be unreachable.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ } else {
+ llvm_unreachable("Unexpected value kind.");
+ }
}
// Check that we don't have same operands. No need to reorder if operands
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
index 928cbe36554119..1e7cc9c268cfa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
; CHECK-NEXT: [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
-; CHECK-NEXT: [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL35_248]], ptr [[IMAG_247]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
-; CHECK-NEXT: [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
; CHECK-NEXT: [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
-; CHECK-NEXT: [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
; CHECK-NEXT: store double [[TMP2]], ptr [[PHASES_IN]], align 8
; CHECK-NEXT: ret void
;
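To make the test change above concrete: the pattern being vectorized is one scalar load reused by every lane. Below is a hypothetical C++ source sketch of that shape; the function and all names are invented for illustration and are not taken from the PR.

// Hypothetical sketch only; names are invented.
// The single load of `phase` is reused by every multiply, so with splat
// ordering enabled SLP can broadcast it into a <4 x double> vector and
// emit one vector fmul plus one vector store instead of four scalar ops.
void rephase_like(double *out, const double *in, const double *phase_ptr) {
  double phase = *phase_ptr; // one scalar load, used in all four lanes
  out[0] = phase * 0.0;      // lanes 0-1 multiply by constants...
  out[1] = phase * 0.0;
  out[2] = phase * in[0];    // ...lanes 2-3 multiply by loaded values,
  out[3] = phase * in[1];    // matching the <2 x double> subvector insert.
}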
Ping!

Ping!
LGTM
Adds cost estimation for the variants of the permutations of the scalar values used in gather nodes. Currently, SLP unconditionally emits shuffles for the reused buildvectors, but in some cases it is better to leave them as buildvectors rather than shuffles, if the cost of such buildvectors is lower.

X86, AVX512, -O3+LTO
Metric: size..text

Program                                                                         results      results0     diff
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test                     912998.00    913238.00   0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test     203070.00    203102.00   0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test        1396320.00   1396448.00   0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test         1396320.00   1396448.00   0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test                           309790.00    309678.00  -0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test        12477607.00  12470807.00  -0.1%

CINT2006/445.gobmk - extra code vectorized
MiBench/consumer-lame - small variations
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vectorized code
Benchmarks/Bullet - extra code vectorized
CFP2017rate/526.blender_r - extra vector code

RISC-V, sifive-p670, -O3+LTO
CFP2006/433.milc - regressions, should be fixed by #115173
CFP2006/453.povray - extra vectorized code
CFP2017rate/508.namd_r - better vector code
CFP2017rate/510.parest_r - extra vectorized code
SPEC/CFP2017rate - extra/better vector code
CFP2017rate/526.blender_r - extra vectorized code
CFP2017rate/538.imagick_r - extra vectorized code
CINT2006/403.gcc - extra vectorized code
CINT2006/445.gobmk - extra vectorized code
CINT2006/464.h264ref - extra vectorized code
CINT2006/483.xalancbmk - small variations
CINT2017rate/525.x264_r - better vectorization

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #115201
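A minimal sketch of the cost comparison that commit message describes, assuming the TargetTransformInfo hooks of that era; the helper preferBuildVector, its parameters, and the choice of cost kind are invented for illustration and this is not the actual SLP implementation:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Hypothetical helper: should a reused gather sequence stay a buildvector
// instead of being re-shuffled out of an already-built vector?
static bool preferBuildVector(const TargetTransformInfo &TTI,
                              FixedVectorType *VecTy, ArrayRef<int> Mask) {
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  // Cost of permuting the existing vector into the reused element order.
  InstructionCost ShuffleCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
  // Cost of rebuilding the same values from scalars (one insertelement
  // per demanded lane, no extracts).
  APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  InstructionCost BuildCost = TTI.getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  // Keep the buildvector form only when it is strictly cheaper.
  return BuildCost < ShuffleCost;
}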
Enables splat support for loads when the number of lanes > 2 or the number of operands > 2. Allows better detection of splats of loads and reduces the number of shuffles in some cases.
X86, AVX512, -O3+LTO
Metric: size..text

Program                                                                    results      results0     diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test                   154867.00    156723.00   1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test   12467735.00  12468023.00   0.0%

Better vectorization quality
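For reference, the reordering-mode decision after this patch, condensed from the SLPVectorizer.cpp hunk above (a paraphrase, not a verbatim copy): the splat check now runs before the load check, and shouldBroadcast only declines loads in the smallest 2-lane, 2-operand case, where a pair of loads may still form a consecutive vector load.

// Condensed from the patched hunk above.
if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
  // Splat is now considered first, even when lane 0 is a load...
  if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
      !canBeVectorized(OpILane0, OpIdx, FirstLane))
    ReorderingModes[OpIdx] = ReorderingMode::Splat;
  // ...and only loads that should not be broadcast fall back to
  // consecutive-load matching.
  else if (isa<LoadInst>(OpILane0))
    ReorderingModes[OpIdx] = ReorderingMode::Load;
  else
    ReorderingModes[OpIdx] = ReorderingMode::Opcode;
}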