@@ -11571,6 +11571,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
11571
11571
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
11572
11572
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
11573
11573
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
11574
+ SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
11574
11575
for (ExternalUser &EU : ExternalUses) {
11575
11576
// Uses by ephemeral values are free (because the ephemeral value will be
11576
11577
// removed prior to code generation, and so the extraction will be
@@ -11706,7 +11707,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
11706
11707
// Can use original instruction, if no operands vectorized or they are
11707
11708
// marked as externally used already.
11708
11709
auto *Inst = cast<Instruction>(EU.Scalar);
11709
- bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
11710
+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
11711
+ auto OperandIsScalar = [&](Value *V) {
11710
11712
if (!getTreeEntry(V)) {
11711
11713
// Some extractelements might not be vectorized, but
11712
11714
// transformed into shuffle and removed from the function,
@@ -11716,9 +11718,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
11716
11718
return true;
11717
11719
}
11718
11720
return ValueToExtUses->contains(V);
11719
- });
11721
+ };
11722
+ bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
11723
+ bool CanBeUsedAsScalarCast = false;
11724
+ if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
11725
+ if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
11726
+ Op && all_of(Op->operands(), OperandIsScalar)) {
11727
+ InstructionCost OpCost =
11728
+ (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
11729
+ ? TTI->getInstructionCost(Op, CostKind)
11730
+ : 0;
11731
+ if (ScalarCost + OpCost <= ExtraCost) {
11732
+ CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
11733
+ ScalarCost += OpCost;
11734
+ }
11735
+ }
11736
+ }
11720
11737
if (CanBeUsedAsScalar) {
11721
- InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
11722
11738
bool KeepScalar = ScalarCost <= ExtraCost;
11723
11739
// Try to keep original scalar if the user is the phi node from the same
11724
11740
// block as the root phis, currently vectorized. It allows to keep
@@ -11774,12 +11790,34 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
11774
11790
ExtraCost = ScalarCost;
11775
11791
if (!IsPhiInLoop(EU))
11776
11792
ExtractsCount[Entry].insert(Inst);
11793
+ if (CanBeUsedAsScalarCast) {
11794
+ ScalarOpsFromCasts.insert(Inst->getOperand(0));
11795
+ // Update the users of the operands of the cast operand to avoid
11796
+ // a compiler crash.
11797
+ if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
11798
+ for_each(IOp->operands(), [&](Value *V) {
11799
+ auto It = ValueToExtUses->find(V);
11800
+ if (It != ValueToExtUses->end()) {
11801
+ // Replace all uses to avoid a compiler crash.
11802
+ ExternalUses[It->second].User = nullptr;
11803
+ }
11804
+ });
11805
+ }
11806
+ }
11777
11807
}
11778
11808
}
11779
11809
}
11780
11810
11781
11811
ExtractCost += ExtraCost;
11782
11812
}
11813
+ // Insert externals for extract of operands of casts to be emitted as scalars
11814
+ // instead of extractelement.
11815
+ for (Value *V : ScalarOpsFromCasts) {
11816
+ ExternalUsesAsOriginalScalar.insert(V);
11817
+ if (const TreeEntry *E = getTreeEntry(V)) {
11818
+ ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
11819
+ }
11820
+ }
11783
11821
// Add reduced value cost, if resized.
11784
11822
if (!VectorizedVals.empty()) {
11785
11823
const TreeEntry &Root = *VectorizableTree.front();
@@ -13095,7 +13133,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
13095
13133
UniqueBases.insert(VecBase);
13096
13134
// If the only use is vectorized - can delete the extractelement
13097
13135
// itself.
13098
- if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
13136
+ if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
13137
+ (NumParts != 1 && count(E->Scalars, EI) > 1) ||
13099
13138
any_of(EI->users(), [&](User *U) {
13100
13139
const TreeEntry *UTE = R.getTreeEntry(U);
13101
13140
return !UTE || R.MultiNodeScalars.contains(U) ||
0 commit comments