Skip to content

Commit da118c9

Browse files
committed
[SLP] Do extra analysis in minbitwidth if some checks return false.
The instruction itself can be considered good for minbitwidth casting, even if one of the operand checks returns false. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #84363.
1 parent e4f9175 commit da118c9

File tree

2 files changed

+64
-51
lines changed

2 files changed

+64
-51
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 53 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -13987,15 +13987,6 @@ bool BoUpSLP::collectValuesToDemote(
1398713987
// If the value is not a vectorized instruction in the expression and not used
1398813988
// by the insertelement instruction and not used in multiple vector nodes, it
1398913989
// cannot be demoted.
13990-
// TODO: improve handling of gathered values and others.
13991-
auto *I = dyn_cast<Instruction>(V);
13992-
const TreeEntry *ITE = I ? getTreeEntry(I) : nullptr;
13993-
if (!ITE || !Visited.insert(I).second || MultiNodeScalars.contains(I) ||
13994-
all_of(I->users(), [&](User *U) {
13995-
return isa<InsertElementInst>(U) && !getTreeEntry(U);
13996-
}))
13997-
return false;
13998-
1399913990
auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
1400013991
if (MultiNodeScalars.contains(V))
1400113992
return false;
@@ -14010,8 +14001,44 @@ bool BoUpSLP::collectValuesToDemote(
1401014001
BitWidth = std::max(BitWidth, BitWidth1);
1401114002
return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
1401214003
};
14004+
auto FinalAnalysis = [&](const TreeEntry *ITE = nullptr) {
14005+
if (!IsProfitableToDemote)
14006+
return false;
14007+
return (ITE && ITE->UserTreeIndices.size() > 1) ||
14008+
IsPotentiallyTruncated(V, BitWidth);
14009+
};
14010+
// TODO: improve handling of gathered values and others.
14011+
auto *I = dyn_cast<Instruction>(V);
14012+
const TreeEntry *ITE = I ? getTreeEntry(I) : nullptr;
14013+
if (!ITE || !Visited.insert(I).second || MultiNodeScalars.contains(I) ||
14014+
all_of(I->users(), [&](User *U) {
14015+
return isa<InsertElementInst>(U) && !getTreeEntry(U);
14016+
}))
14017+
return FinalAnalysis();
14018+
1401314019
unsigned Start = 0;
1401414020
unsigned End = I->getNumOperands();
14021+
14022+
auto ProcessOperands = [&](ArrayRef<Value *> Operands, bool &NeedToExit) {
14023+
NeedToExit = false;
14024+
unsigned InitLevel = MaxDepthLevel;
14025+
for (Value *IncValue : Operands) {
14026+
unsigned Level = InitLevel;
14027+
if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
14028+
ToDemote, DemotedConsts, Visited, Level,
14029+
IsProfitableToDemote, IsTruncRoot)) {
14030+
if (!IsProfitableToDemote)
14031+
return false;
14032+
NeedToExit = true;
14033+
if (!FinalAnalysis(ITE))
14034+
return false;
14035+
continue;
14036+
}
14037+
MaxDepthLevel = std::max(MaxDepthLevel, Level);
14038+
}
14039+
return true;
14040+
};
14041+
bool NeedToExit = false;
1401514042
switch (I->getOpcode()) {
1401614043

1401714044
// We can always demote truncations and extensions. Since truncations can
@@ -14037,35 +14064,21 @@ bool BoUpSLP::collectValuesToDemote(
1403714064
case Instruction::And:
1403814065
case Instruction::Or:
1403914066
case Instruction::Xor: {
14040-
unsigned Level1 = MaxDepthLevel, Level2 = MaxDepthLevel;
14041-
if ((ITE->UserTreeIndices.size() > 1 &&
14042-
!IsPotentiallyTruncated(I, BitWidth)) ||
14043-
!collectValuesToDemote(I->getOperand(0), IsProfitableToDemoteRoot,
14044-
BitWidth, ToDemote, DemotedConsts, Visited,
14045-
Level1, IsProfitableToDemote, IsTruncRoot) ||
14046-
!collectValuesToDemote(I->getOperand(1), IsProfitableToDemoteRoot,
14047-
BitWidth, ToDemote, DemotedConsts, Visited,
14048-
Level2, IsProfitableToDemote, IsTruncRoot))
14067+
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
14068+
return false;
14069+
if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
1404914070
return false;
14050-
MaxDepthLevel = std::max(Level1, Level2);
1405114071
break;
1405214072
}
1405314073

1405414074
// We can demote selects if we can demote their true and false values.
1405514075
case Instruction::Select: {
14076+
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
14077+
return false;
1405614078
Start = 1;
14057-
unsigned Level1 = MaxDepthLevel, Level2 = MaxDepthLevel;
14058-
SelectInst *SI = cast<SelectInst>(I);
14059-
if ((ITE->UserTreeIndices.size() > 1 &&
14060-
!IsPotentiallyTruncated(I, BitWidth)) ||
14061-
!collectValuesToDemote(SI->getTrueValue(), IsProfitableToDemoteRoot,
14062-
BitWidth, ToDemote, DemotedConsts, Visited,
14063-
Level1, IsProfitableToDemote, IsTruncRoot) ||
14064-
!collectValuesToDemote(SI->getFalseValue(), IsProfitableToDemoteRoot,
14065-
BitWidth, ToDemote, DemotedConsts, Visited,
14066-
Level2, IsProfitableToDemote, IsTruncRoot))
14079+
auto *SI = cast<SelectInst>(I);
14080+
if (!ProcessOperands({SI->getTrueValue(), SI->getFalseValue()}, NeedToExit))
1406714081
return false;
14068-
MaxDepthLevel = std::max(Level1, Level2);
1406914082
break;
1407014083
}
1407114084

@@ -14075,23 +14088,20 @@ bool BoUpSLP::collectValuesToDemote(
1407514088
PHINode *PN = cast<PHINode>(I);
1407614089
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
1407714090
return false;
14078-
unsigned InitLevel = MaxDepthLevel;
14079-
for (Value *IncValue : PN->incoming_values()) {
14080-
unsigned Level = InitLevel;
14081-
if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
14082-
ToDemote, DemotedConsts, Visited, Level,
14083-
IsProfitableToDemote, IsTruncRoot))
14084-
return false;
14085-
MaxDepthLevel = std::max(MaxDepthLevel, Level);
14086-
}
14091+
SmallVector<Value *> Ops(PN->incoming_values().begin(),
14092+
PN->incoming_values().end());
14093+
if (!ProcessOperands(Ops, NeedToExit))
14094+
return false;
1408714095
break;
1408814096
}
1408914097

1409014098
// Otherwise, conservatively give up.
1409114099
default:
1409214100
MaxDepthLevel = 1;
14093-
return IsProfitableToDemote && IsPotentiallyTruncated(I, BitWidth);
14101+
return FinalAnalysis();
1409414102
}
14103+
if (NeedToExit)
14104+
return true;
1409514105

1409614106
++MaxDepthLevel;
1409714107
// Gather demoted constant operands.
@@ -14130,15 +14140,17 @@ void BoUpSLP::computeMinimumValueSizes() {
1413014140

1413114141
// The first value node for store/insertelement is sext/zext/trunc? Skip it,
1413214142
// resize to the final type.
14143+
bool IsTruncRoot = false;
1413314144
bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
1413414145
if (NodeIdx != 0 &&
1413514146
VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
1413614147
(VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt ||
1413714148
VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt ||
1413814149
VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) {
1413914150
assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14140-
++NodeIdx;
14151+
IsTruncRoot = VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc;
1414114152
IsProfitableToDemoteRoot = true;
14153+
++NodeIdx;
1414214154
}
1414314155

1414414156
// Analyzed in reduction already and not profitable - exit.
@@ -14270,7 +14282,6 @@ void BoUpSLP::computeMinimumValueSizes() {
1427014282
ReductionBitWidth = bit_ceil(ReductionBitWidth);
1427114283
}
1427214284
bool IsTopRoot = NodeIdx == 0;
14273-
bool IsTruncRoot = false;
1427414285
while (NodeIdx < VectorizableTree.size() &&
1427514286
VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
1427614287
VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {

llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,7 @@ for.end: ; preds = %for.end.loopexit, %
228228
; YAML-NEXT: Function: test_unrolled_select
229229
; YAML-NEXT: Args:
230230
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
231-
; YAML-NEXT: - Cost: '-36'
231+
; YAML-NEXT: - Cost: '-41'
232232
; YAML-NEXT: - String: ' and with tree size '
233233
; YAML-NEXT: - TreeSize: '10'
234234

@@ -246,15 +246,17 @@ define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noali
246246
; CHECK-NEXT: [[P2_045:%.*]] = phi ptr [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
247247
; CHECK-NEXT: [[P1_044:%.*]] = phi ptr [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
248248
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[P1_044]], align 1
249-
; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
249+
; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
250250
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[P2_045]], align 1
251-
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
252-
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
253-
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[TMP4]], zeroinitializer
254-
; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP4]]
255-
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP4]]
256-
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
257-
; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP8]], [[S_047]]
251+
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
252+
; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
253+
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i32>
254+
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], zeroinitializer
255+
; CHECK-NEXT: [[TMP7:%.*]] = sub <8 x i16> zeroinitializer, [[TMP4]]
256+
; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP4]]
257+
; CHECK-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[TMP8]] to <8 x i32>
258+
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
259+
; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP10]], [[S_047]]
258260
; CHECK-NEXT: [[CMP83:%.*]] = icmp slt i32 [[OP_RDX]], [[LIM:%.*]]
259261
; CHECK-NEXT: br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
260262
; CHECK: if.end.86:

0 commit comments

Comments
 (0)