
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

Merged
50 changes: 40 additions & 10 deletions

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1107,7 +1107,7 @@ class BoUpSLP {
     MinBWs.clear();
     ReductionBitWidth = 0;
     CastMaxMinBWSizes.reset();
-    TruncNodes.clear();
+    ExtraBitWidthNodes.clear();
     InstrElementSize.clear();
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
@@ -3683,8 +3683,9 @@ class BoUpSLP {
   /// type sizes, used in the tree.
   std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
 
-  /// Indices of the vectorized trunc nodes.
-  DenseSet<unsigned> TruncNodes;
+  /// Indices of the vectorized nodes, which supposed to be the roots of the new
+  /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
+  DenseSet<unsigned> ExtraBitWidthNodes;
 };
 
 } // end namespace slpvectorizer
@@ -6612,7 +6613,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                PrevMaxBW),
             std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                                PrevMinBW));
-        TruncNodes.insert(VectorizableTree.size());
+        ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+      } else if (ShuffleOrOp == Instruction::SIToFP ||
+                 ShuffleOrOp == Instruction::UIToFP) {
+        unsigned NumSignBits =
+            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
+        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
+          APInt Mask = DB->getDemandedBits(OpI);
+          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
+        }
+        if (NumSignBits * 2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+          ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
       }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
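
A note on the heuristic above: the integer operand of a uitofp/sitofp is only recorded as an extra analysis root when its known sign bits (possibly improved by the demanded-bits mask) cover at least half of its type width, i.e. the value would still be representable after halving the element width. A minimal standalone sketch of that check, with made-up names and plain integers instead of the SLP data structures:

// Sketch only (hypothetical helper, not part of SLPVectorizer): a value with
// NumSignBits redundant sign bits needs only BitWidth - NumSignBits + 1 bits,
// so requiring NumSignBits * 2 >= BitWidth means it still fits after roughly
// halving the element width.
#include <algorithm>
#include <cstdio>

bool worthExtraBitwidthRoot(unsigned NumSignBits,
                            unsigned NumLeadingZeroDemandedBits,
                            unsigned BitWidth) {
  // Demanded-bits information can only raise the effective sign-bit count.
  unsigned Known = std::max(NumSignBits, NumLeadingZeroDemandedBits);
  return Known * 2 >= BitWidth;
}

int main() {
  // A 32-bit operand known to have 24 sign bits fits in 16 bits.
  std::printf("%d\n", worthExtraBitwidthRoot(24, 0, 32)); // prints 1
  std::printf("%d\n", worthExtraBitwidthRoot(8, 0, 32));  // prints 0
  return 0;
}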
@@ -6660,6 +6672,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       TE->setOperand(1, Right);
       buildTree_rec(Left, Depth + 1, {TE, 0});
       buildTree_rec(Right, Depth + 1, {TE, 1});
+      if (ShuffleOrOp == Instruction::ICmp) {
+        unsigned NumSignBits0 =
+            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
+        if (NumSignBits0 * 2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+        unsigned NumSignBits1 =
+            ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
+        if (NumSignBits1 * 2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+      }
       return;
     }
     case Instruction::Select:
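
The icmp path applies the same test to both operands: each operand's tree entry becomes a candidate root for another min-bitwidth attempt when that operand already fits in half of its integer width. A rough sketch under those assumptions (hypothetical types; the real code goes through getOperandEntry and ComputeNumSignBits):

// Sketch only (hypothetical structures, not the SLP vectorizer tree): for a
// compare bundle, record the tree-entry index of every operand that fits in
// half of its integer width as an extra min-bitwidth analysis root.
#include <cstddef>
#include <set>

struct OperandNode {
  std::size_t TreeEntryIdx; // index of the operand's node in the tree
  unsigned NumSignBits;     // known sign bits of the scalar operand
  unsigned BitWidth;        // width of the operand's integer type
};

void collectCmpExtraRoots(const OperandNode (&Ops)[2],
                          std::set<std::size_t> &ExtraRoots) {
  for (const OperandNode &Op : Ops)
    if (Op.NumSignBits * 2 >= Op.BitWidth)
      ExtraRoots.insert(Op.TreeEntryIdx);
}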
@@ -14302,7 +14326,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   bool IsStoreOrInsertElt =
       VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
-  if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+  if ((IsStoreOrInsertElt || UserIgnoreList) &&
+      ExtraBitWidthNodes.size() <= 1 &&
       (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
        CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
     return;
@@ -14506,16 +14531,21 @@ void BoUpSLP::computeMinimumValueSizes() {
     IsTopRoot = false;
     IsProfitableToDemoteRoot = true;
 
-    if (TruncNodes.empty()) {
+    if (ExtraBitWidthNodes.empty()) {
       NodeIdx = VectorizableTree.size();
     } else {
       unsigned NewIdx = 0;
       do {
-        NewIdx = *TruncNodes.begin() + 1;
-        TruncNodes.erase(TruncNodes.begin());
-      } while (NewIdx <= NodeIdx && !TruncNodes.empty());
+        NewIdx = *ExtraBitWidthNodes.begin();
+        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
+      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
       NodeIdx = NewIdx;
-      IsTruncRoot = true;
+      IsTruncRoot = any_of(
+          VectorizableTree[NewIdx]->UserTreeIndices, [](const EdgeInfo &EI) {
+            return EI.EdgeIdx == 0 &&
+                   EI.UserTE->getOpcode() == Instruction::ICmp &&
+                   !EI.UserTE->isAltShuffle();
+          });
     }
 
   // If the maximum bit width we compute is less than the with of the roots'
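
For readability, here is the shape of the loop that now drives repeated analysis attempts, rewritten with an ordered std::set and made-up names (the pass itself keeps the indices in a DenseSet, and separately checks whether the chosen node feeds operand 0 of a non-alternate icmp to set IsTruncRoot):

// Sketch only (hypothetical helper): pop recorded extra-root indices until one
// lies past the node that was just processed; if none are recorded, return the
// tree size so the caller's walk terminates.
#include <set>

unsigned advanceAnalysisRoot(std::set<unsigned> &ExtraRoots, unsigned NodeIdx,
                             unsigned TreeSize) {
  if (ExtraRoots.empty())
    return TreeSize;
  unsigned NewIdx = 0;
  do {
    NewIdx = *ExtraRoots.begin();
    ExtraRoots.erase(ExtraRoots.begin());
  } while (NewIdx <= NodeIdx && !ExtraRoots.empty());
  return NewIdx;
}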

@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
 ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
 ; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
 ; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
 ; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>

@@ -10,12 +10,14 @@ define void @test() {
 ; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
 ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])