@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
 
+static cl::opt<bool> EnableShufflePadding(
+    "enable-shuffle-padding", cl::init(true), cl::Hidden,
+    cl::desc("Enable shuffle padding to generate structure store."));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
   // correct start value of reduction PHIs when vectorizing the epilogue.
   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
       ReductionResumeValues;
+
+  /// The map stores shuffles which are used to pad the gap of the interleaved
+  /// store groups. The key for the map is the entry of the load group that is
+  /// matched to the related store group.
+  MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
+                                     Value *Ptr) const {
+    return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
                        : ShuffledMask;
   };
 
+  Value *MatchedLoad = nullptr;
+  bool IsShufflePadding = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
+      TTI->enableScalableVectorization()) {
+    IsShufflePadding = true;
+    if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
+      MatchedLoad = Cost->hasMatchedLoadGroupForStore(
+          Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     for (unsigned I = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);
 
-      // Skip the gaps in the group.
-      if (!Member)
+      SmallVector<Value *, 4> Shuffles;
+      // Skip the gaps in the group if there is no padding.
+      if (!Member && !IsShufflePadding)
         continue;
 
       auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         Value *StridedVec = Builder.CreateShuffleVector(
             NewLoads[Part], StrideMask, "strided.vec");
 
+        if (!Member) {
+          if (Group->isReverse())
+            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+          Shuffles.push_back(StridedVec);
+          continue;
+        }
+
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         if (Group->isReverse())
           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
 
+        Shuffles.push_back(StridedVec);
+
         State.set(VPDefs[J], StridedVec, Part);
       }
-      ++J;
+      PaddedShufflesMap[Instr].push_back(Shuffles);
+      if (Member)
+        ++J;
     }
     return;
   }
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
              "Fail to get a member from an interleaved store group");
       Instruction *Member = Group->getMember(i);
 
+      if (!Member && MatchedLoad) {
+        // %wide.vec = load <12 x float>                        ; 0,1,2,3,...,11
+        // %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; 0,3,6,9
+        // %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
+        // %padded = shuffle %wide.vec, poison, <2, 5, 8, 11>   ; 2,5,8,11
+        //
+        // %concate1 = shuffle %op1, %op2, <0, 1, ..., 7>       ; 0,3,6,9,1,4,7,10
+        // %concate2 = shuffle %padded, poison,
+        //             <0, 1, ..., 3, undef, undef, undef, undef>
+        //                                            ; 2,5,8,11,poison,...,poison
+        // %concateFinal = shuffle %concate1, %concate2,
+        //                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
+        // store <12 x float> %concateFinal
+        Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
+        StoredVecs.push_back(PaddedShuffle);
+        continue;
+      }
+
       // Skip the gaps in the group.
       if (!Member) {
         Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     // Interleave all the smaller vectors into one wider vector.
     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
-    if (BlockInMask || MaskForGaps) {
+    if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                 Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
     if (Group->getMember(IF))
       Indices.push_back(IF);
 
+  bool IsShufflePaddingStore = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
+      TTI.enableScalableVectorization() && !VF.isScalable())
+    IsShufflePaddingStore = true;
+
   // Calculate the cost of the whole interleaved group.
+  // If shuffle padding is enabled, ignore gaps.
   bool UseMaskForGaps =
       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
-      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
+      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
+       (!IsShufflePaddingStore ||
+        !hasMatchedLoadGroupForStore(I, I->getParent(),
+                                     getUnderlyingObject(I->getOperand(1)))));
   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
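
For context, here is a minimal C-style sketch of the kind of source loop this padding targets. It is a hypothetical illustration, not taken from the patch, and the struct and function names are invented: the stores cover only two of the three interleaved fields, so the store group has a gap, while the loads of the same underlying object form the matched load group whose extra strided shuffle re-supplies the untouched lanes, allowing a full-width unmasked store. The <12 x float> IR in the comment above would correspond to four iterations of such a loop.

struct S { float x, y, z; };

void scale_xy(struct S *a, int n) {
  for (int i = 0; i < n; ++i) {
    a[i].x = a[i].x * 2.0f; // load/store interleave groups, member 0
    a[i].y = a[i].y * 2.0f; // load/store interleave groups, member 1
    // a[i].z is untouched: the gap at member 2 that shuffle padding can fill
    // from the matched load group's wide load.
  }
}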