[SLP]Do not include the cost of and -1, <v> and emit just <v> after MinBitWidth. #90739
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

After minbitwidth analysis, and <v>, (power_of_2 - 1 const) can be transformed into just an and <v>, (all_ones const), which can be ignored at the cost estimation and at the codegen, so only <v> needs to be emitted. The x264 benchmark has this pattern.

Full diff: https://github.com/llvm/llvm-project/pull/90739.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c33d90d531bf53..914a7b205256c2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9484,6 +9484,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Op1Info, Op2Info, Operands, VI);
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
+ if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+ for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+ ArrayRef<Value *> Ops = E->getOperand(I);
+ if (all_of(Ops, [&](Value *Op) {
+ auto *CI = dyn_cast<ConstantInt>(Op);
+ return CI && CI->getValue().countr_one() == It->second.first;
+ }))
+ return CommonCost;
+ }
+ }
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
@@ -12969,6 +12979,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+ for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+ ArrayRef<Value *> Ops = E->getOperand(I);
+ if (all_of(Ops, [&](Value *Op) {
+ auto *CI = dyn_cast<ConstantInt>(Op);
+ return CI && CI->getValue().countr_one() == It->second.first;
+ })) {
+ V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ }
+ }
if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
assert((It != MinBWs.end() ||
getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
index fc977585614baf..d6dc3bcc3354c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
@@ -12,8 +12,7 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
; CHECK-NEXT: [[T:%.*]] = trunc i32 [[TMP5]] to i16
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
index 04d275742832ef..0c0c723e669961 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
@@ -9,8 +9,7 @@ define i32 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP5]], i32 1)
Review thread on the new check in getEntryCost:

ArrayRef<Value *> Ops = E->getOperand(I);
if (all_of(Ops, [&](Value *Op) {
      auto *CI = dyn_cast<ConstantInt>(Op);
      return CI && CI->getValue().countr_one() == It->second.first;
Why not CI->getValue().isAllOnes()? Ideally we'd have an isNeutralValue helper that would work for other binops as well.
In the original bitwidth it might not be an all-ones value, e.g.:

%conv.1 = trunc i64 %1 to i32
%conv3.1 = and i32 %conv.1, 65535

With the original i32 type, 65535 is not all-ones. But SLP can transform it to <n x i16>, where <i16 65535, i16 65535> becomes the all-ones <i16 -1, i16 -1>.
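A minimal standalone sketch of that point (an illustration, not code from the patch), assuming LLVM's APInt from llvm/ADT/APInt.h; MinBW stands in for the minimized bit width the patch reads from It->second.first:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt Mask(/*numBits=*/32, /*val=*/65535); // the i32 constant above
  unsigned MinBW = 16;                             // minimized bit width

  assert(!Mask.isAllOnes());              // not all-ones in the original i32
  assert(Mask.countr_one() == MinBW);     // exactly 16 trailing ones
  assert(Mask.trunc(MinBW).isAllOnes());  // all-ones once narrowed to i16
}

So comparing countr_one() against the minimized width detects masks that become all-ones only after narrowing, which isAllOnes() on the original constant cannot.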
So should it be CI->getValue().countr_one() >= It->second.first?
Yes, will adjust the check
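A hedged sketch of the case the original == comparison would miss, under the same APInt assumptions as above: a constant can have more trailing ones than the minimized width and still truncate to an all-ones mask.

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt Mask(/*numBits=*/32, /*val=*/0xFFFF); // 16 trailing ones
  unsigned MinBW = 8;                               // narrower than 16

  assert(Mask.countr_one() > MinBW);      // '==' against MinBW would fail
  assert(Mask.countr_one() >= MinBW);     // the adjusted check holds
  assert(Mask.trunc(MinBW).isAllOnes());  // still a no-op mask at i8
}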
Created using spr 1.3.5
LGTM - cheers
After minbitwidth analysis, and <v>, (power_of_2 - 1 const) can be transformed into just an and <v>, (all_ones const), which can be ignored at the cost estimation and at the codegen, so only <v> needs to be emitted. The x264 benchmark has this pattern.
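For reference, a sketch of the predicate the patch adds, with the review's >= adjustment folded in. The helper name isNoOpMaskAfterNarrowing is hypothetical; the actual patch inlines this check as a lambda in SLPVectorizer.cpp.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Value.h"

// Hypothetical helper: true when every lane of an 'and' operand vector is a
// constant whose low MinBW bits are all ones, i.e. the mask becomes a no-op
// once the operands are truncated to MinBW bits.
static bool isNoOpMaskAfterNarrowing(llvm::ArrayRef<llvm::Value *> Ops,
                                     unsigned MinBW) {
  return llvm::all_of(Ops, [MinBW](llvm::Value *Op) {
    auto *CI = llvm::dyn_cast<llvm::ConstantInt>(Op);
    return CI && CI->getValue().countr_one() >= MinBW; // '>=' per the review
  });
}

When this holds, the 'and' contributes only CommonCost at cost estimation, and codegen forwards the non-constant operand directly, as the test diffs above show.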