Skip to content

Commit 59ef94d

Browse files
[SLP] Do not include the cost of `and -1, <v>` and emit just `<v>` after MinBitWidth.
After minbitwidth analysis, `and <v>, (power_of_2 - 1 const)` can be transformed into just `and <v>, (all_ones const)`, which can be ignored both at cost estimation and at codegen. The x264 benchmark has this pattern. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #90739
1 parent e846778 commit 59ef94d

File tree

3 files changed

+26
-4
lines changed

3 files changed

+26
-4
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9484,6 +9484,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
94849484
Op1Info, Op2Info, Operands, VI);
94859485
};
94869486
auto GetVectorCost = [=](InstructionCost CommonCost) {
9487+
if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9488+
for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9489+
ArrayRef<Value *> Ops = E->getOperand(I);
9490+
if (all_of(Ops, [&](Value *Op) {
9491+
auto *CI = dyn_cast<ConstantInt>(Op);
9492+
return CI && CI->getValue().countr_one() >= It->second.first;
9493+
}))
9494+
return CommonCost;
9495+
}
9496+
}
94879497
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
94889498
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
94899499
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
@@ -12969,6 +12979,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1296912979
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
1297012980
return E->VectorizedValue;
1297112981
}
12982+
if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
12983+
for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
12984+
ArrayRef<Value *> Ops = E->getOperand(I);
12985+
if (all_of(Ops, [&](Value *Op) {
12986+
auto *CI = dyn_cast<ConstantInt>(Op);
12987+
return CI && CI->getValue().countr_one() >= It->second.first;
12988+
})) {
12989+
V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
12990+
E->VectorizedValue = V;
12991+
++NumVectorInstructions;
12992+
return V;
12993+
}
12994+
}
12995+
}
1297212996
if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
1297312997
assert((It != MinBWs.end() ||
1297412998
getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||

llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ define i16 @test() {
1212
; CHECK-NEXT: entry:
1313
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
1414
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
15-
; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
16-
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
15+
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
1716
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
1817
; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
1918
; CHECK-NEXT: [[T:%.*]] = trunc i32 [[TMP5]] to i16

llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ define i32 @test() {
99
; CHECK-NEXT: entry:
1010
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
1111
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
12-
; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
13-
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
12+
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
1413
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
1514
; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
1615
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP5]], i32 1)

0 commit comments

Comments
 (0)