@@ -7934,6 +7934,33 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::Store: {
+      Type *ScalarTy =
+          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+      // Check if profitable to represent a reverse shuffle + consecutive
+      // store as a strided store with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+                                 BaseSI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
+        if (StridedCost < OriginalVecCost)
+          // The strided store is more profitable than the reverse shuffle +
+          // consecutive store - transform the node to a strided store.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
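
For orientation only, not part of the diff: a minimal sketch of the scalar shape this new case targets, assuming a bundle of four i32 stores whose vectorization order is the reverse of the address order (function and value names are hypothetical). The cost check above compares a contiguous vector store plus an SK_Reverse shuffle against a single strided store with stride -1 and flips the node to StridedVectorize when the latter is cheaper.

    ; Hypothetical LLVM IR input: addresses increase, values arrive reversed.
    define void @reversed_stores(ptr %p, i32 %v0, i32 %v1, i32 %v2, i32 %v3) {
      %p1 = getelementptr inbounds i32, ptr %p, i64 1
      %p2 = getelementptr inbounds i32, ptr %p, i64 2
      %p3 = getelementptr inbounds i32, ptr %p, i64 3
      store i32 %v3, ptr %p, align 4
      store i32 %v2, ptr %p1, align 4
      store i32 %v1, ptr %p2, align 4
      store i32 %v0, ptr %p3, align 4
      ret void
    }
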
@@ -9466,11 +9493,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       // We know that we can merge the stores. Calculate the cost.
-      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
-                                  BaseSI->getPointerAddressSpace(), CostKind,
-                                  OpInfo) +
-             CommonCost;
+      InstructionCost VecStCost;
+      if (E->State == TreeEntry::StridedVectorize) {
+        Align CommonAlignment =
+            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
+        VecStCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind);
+      } else {
+        assert(E->State == TreeEntry::Vectorize &&
+               "Expected either strided or consecutive stores.");
+        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+        VecStCost = TTI->getMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getAlign(),
+            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+      }
+      return VecStCost + CommonCost;
     };
     SmallVector<Value *> PointerOps(VL.size());
     for (auto [I, V] : enumerate(VL)) {
@@ -12398,7 +12436,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
     ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
-    if (E->getOpcode() == Instruction::Store) {
+    if (E->getOpcode() == Instruction::Store &&
+        E->State == TreeEntry::Vectorize) {
       ArrayRef<int> Mask =
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
@@ -12986,8 +13025,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
-      StoreInst *ST =
-          Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      Instruction *ST;
+      if (E->State == TreeEntry::Vectorize) {
+        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      } else {
+        assert(E->State == TreeEntry::StridedVectorize &&
+               "Expected either strided or consecutive stores.");
+        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+        auto *Inst = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vp_strided_store,
+            {VecTy, Ptr->getType(), StrideTy},
+            {VecValue, Ptr,
+             ConstantInt::get(
+                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+             Builder.getAllOnesMask(VecTy->getElementCount()),
+             Builder.getInt32(E->Scalars.size())});
+        Inst->addParamAttr(
+            /*ArgNo=*/1,
+            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+        ST = Inst;
+      }
 
       Value *V = propagateMetadata(ST, E->Scalars);
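
Again as an illustrative sketch rather than verbatim output of the patch: for the hypothetical four-element i32 bundle above, the StridedVectorize path built here emits a call of roughly the following shape, with the common alignment attached as a parameter attribute on the pointer argument and the stride being the negated element allocation size in bytes (-4 for i32). The actual base pointer and operand names depend on the bundle.

    ; Hypothetical vectorized result built via Builder.CreateIntrinsic above.
    declare void @llvm.experimental.vp.strided.store.v4i32.p0.i64(<4 x i32>, ptr, i64, <4 x i1>, i32)

    define void @strided_store_sketch(ptr %base, <4 x i32> %vec) {
      ; stride = -4 bytes, mask = all-ones, EVL = number of scalars in the bundle (4)
      call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
          <4 x i32> %vec, ptr align 4 %base, i64 -4,
          <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
      ret void
    }
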