Skip to content

Commit dc58274

Browse files
author
zhangtiehu
committed
[LoopVectorize] Enable shuffle padding for masked interleaved accesses
typedef struct { float x; float y; float z; } patic; for (int i = 0; i < num; i++) { ps[i].x = factor * ps[i].x; ps[i].y = factor * ps[i].y; } This patch pads the gap of the interleave store group to eliminate the masked.store, which helps generate better code in the Interleaved Access pass, as shown: %wide.vec = load <12 x float>; 0,1,2,3,...,11 %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9 %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10 %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11 %concate1 = shuffle %op1, %op2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10 %concate2 = shuffle %padded, poison, <0, 1, ..., 3, undef, undef, undef, undef> ; 2,5,8,11,poison,...,poison %concateFinal = shuffle %concate1, %concate2, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11 store <12 x float> %concateFinal This patch adds some restrictions for shuffle padding; that is, a target interleave store group must have a matched interleave load group, which means: 1. The value operand of the StoreInst should come from a LoadInst. 2. The store group and the load group access the same struct memory.
1 parent e34c35a commit dc58274

File tree

5 files changed

+1364
-5
lines changed

5 files changed

+1364
-5
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,13 @@ class InterleavedAccessInfo {
819819
/// Returns true if we have any interleave groups.
820820
bool hasGroups() const { return !InterleaveGroups.empty(); }
821821

822+
/// Check if the interleaved store group has a matched load group, meaning
/// the store satisfies the following restrictions:
/// 1. The value operand of the StoreInst comes from a LoadInst.
/// 2. The store group and the load group access the same memory.
826+
Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
827+
Value *Ptr) const;
828+
822829
private:
823830
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
824831
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,6 +1441,29 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
14411441
RequiresScalarEpilogue = false;
14421442
}
14431443

1444+
/// Walk the use-def chain feeding a store to find an interleaved load group
/// that reads the same underlying object the store writes.
///
/// \param Inst the root of the use-def chain to search (typically the store,
///        or the value operand feeding it).
/// \param BB   the block containing the store; the chain must stay inside it.
/// \param Ptr  the underlying object of the store's pointer operand.
/// \returns the insert position of the matched load group, or nullptr if no
///          such group exists.
Value *InterleavedAccessInfo::hasMatchedLoadGroupForStore(Instruction *Inst,
                                                          BasicBlock *BB,
                                                          Value *Ptr) const {
  // Iterative pre-order DFS over the use-def chain. The visited set prevents
  // re-walking shared sub-expressions: without it, a DAG-shaped chain (e.g.
  // repeated x = a + a) makes the naive recursion revisit the same
  // instruction an exponential number of times.
  std::vector<Instruction *> Worklist;
  std::unordered_set<const Instruction *> Visited;
  Worklist.push_back(Inst);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.back();
    Worklist.pop_back();

    // Do not look through PHIs or leave the store's block: the matched load
    // must execute in the same block (and thus the same vector iteration).
    if (isa<PHINode>(Cur) || Cur->getParent() != BB)
      continue;
    if (!Visited.insert(Cur).second)
      continue;

    if (isa<LoadInst>(Cur)) {
      // A load matches when it belongs to an interleave group and reads the
      // same underlying object the store writes to.
      Value *V = getUnderlyingObject(Cur->getOperand(0));
      auto Group = getInterleaveGroup(Cur);
      if (Group && V == Ptr)
        return Group->getInsertPos();
    }

    // Push operands in reverse so they are explored left-to-right, matching
    // the operand order of the original recursive traversal.
    for (unsigned It = Cur->getNumOperands(); It > 0; --It)
      if (Instruction *Op = dyn_cast<Instruction>(Cur->getOperand(It - 1)))
        Worklist.push_back(Op);
  }

  return nullptr;
}
1466+
14441467
template <typename InstT>
14451468
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
14461469
llvm_unreachable("addMetadata can only be used for Instruction");

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
408408
// after prolog. See `emitIterationCountCheck`.
409409
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410410

411+
static cl::opt<bool> EnableShufflePadding(
412+
"enable-shuffle-padding", cl::init(true), cl::Hidden,
413+
cl::desc("Enable shuffle padding to generate structure store."));
414+
411415
/// A helper function that returns true if the given type is irregular. The
412416
/// type is irregular if its allocated size doesn't equal the store size of an
413417
/// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
796800
// correct start value of reduction PHIs when vectorizing the epilogue.
797801
SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
798802
ReductionResumeValues;
803+
804+
/// The map stores shuffles which are used to pad the gap of the interleaved
805+
/// store groups. The key for the map is the entry of the load group who is
806+
/// matched to the related store group.
807+
MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
799808
};
800809

801810
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
17021711
/// \p VF is the vectorization factor chosen for the original loop.
17031712
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
17041713

1714+
Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
1715+
Value *Ptr) const {
1716+
return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
1717+
}
1718+
17051719
private:
17061720
unsigned NumPredStores = 0;
17071721

@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
25572571
: ShuffledMask;
25582572
};
25592573

2574+
Value *MatchedLoad = nullptr;
2575+
bool IsShufflePadding = false;
2576+
if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
2577+
TTI->enableScalableVectorization()) {
2578+
IsShufflePadding = true;
2579+
if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
2580+
MatchedLoad = Cost->hasMatchedLoadGroupForStore(
2581+
Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
2582+
}
2583+
25602584
// Vectorize the interleaved load group.
25612585
if (isa<LoadInst>(Instr)) {
25622586
Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26262650
for (unsigned I = 0; I < InterleaveFactor; ++I) {
26272651
Instruction *Member = Group->getMember(I);
26282652

2629-
// Skip the gaps in the group.
2630-
if (!Member)
2653+
SmallVector<Value *, 4> Shuffles;
2654+
// Skip the gaps in the group if there are no paddings.
2655+
if (!Member && !IsShufflePadding)
26312656
continue;
26322657

26332658
auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26362661
Value *StridedVec = Builder.CreateShuffleVector(
26372662
NewLoads[Part], StrideMask, "strided.vec");
26382663

2664+
if (!Member) {
2665+
if (Group->isReverse())
2666+
StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2667+
Shuffles.push_back(StridedVec);
2668+
continue;
2669+
}
26392670
// If this member has different type, cast the result type.
26402671
if (Member->getType() != ScalarTy) {
26412672
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26462677
if (Group->isReverse())
26472678
StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
26482679

2680+
Shuffles.push_back(StridedVec);
2681+
26492682
State.set(VPDefs[J], StridedVec, Part);
26502683
}
2651-
++J;
2684+
PaddedShufflesMap[Instr].push_back(Shuffles);
2685+
if (Member)
2686+
++J;
26522687
}
26532688
return;
26542689
}
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26722707
"Fail to get a member from an interleaved store group");
26732708
Instruction *Member = Group->getMember(i);
26742709

2710+
if (!Member && MatchedLoad) {
2711+
// %wide.vec = load <12 x float>; 0,1,2,3,...,11
2712+
// %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9
2713+
// %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
2714+
// %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11
2715+
//
2716+
// %concate1 = shuffle %op1, %op2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10
2717+
// %concate2 = shuffle %padded, poison,
2718+
// <0, 1, ..., 3, undef, undef, undef, undef>
2719+
// ; 2,5,8,11,poison,...,poison
2720+
// %concateFinal = shuffle %concate1, %concate2,
2721+
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
2722+
// store <12 x float> %concateFinal
2723+
Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
2724+
StoredVecs.push_back(PaddedShuffle);
2725+
continue;
2726+
}
2727+
26752728
// Skip the gaps in the group.
26762729
if (!Member) {
26772730
Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26962749
// Interleave all the smaller vectors into one wider vector.
26972750
Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
26982751
Instruction *NewStoreInstr;
2699-
if (BlockInMask || MaskForGaps) {
2752+
if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
27002753
Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
27012754
NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
27022755
Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
63256378
if (Group->getMember(IF))
63266379
Indices.push_back(IF);
63276380

6381+
bool IsShufflePaddingStore = false;
6382+
if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
6383+
TTI.enableScalableVectorization() && !VF.isScalable())
6384+
IsShufflePaddingStore = true;
6385+
63286386
// Calculate the cost of the whole interleaved group.
6387+
// If shuffle padding is enabled, ignore gaps.
63296388
bool UseMaskForGaps =
63306389
(Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6331-
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6390+
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
6391+
(!IsShufflePaddingStore ||
6392+
!hasMatchedLoadGroupForStore(I, I->getParent(),
6393+
getUnderlyingObject(I->getOperand(1)))));
63326394
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
63336395
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
63346396
AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
; Test that shuffle padding lets the cost model treat an interleaved store
; group with a gap (the unaccessed .z field) as an unmasked structure store
; when a matched interleaved load group exists, instead of costing an
; expensive masked store (compare the PADDING vs NO-PADDING costs below).
; REQUIRES: asserts
; RUN: opt -enable-shuffle-padding=true -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=PADDING
; RUN: opt -enable-shuffle-padding=false -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=NO-PADDING

; Three-float struct; the loop below reads and writes only .x and .y, so the
; interleave groups have factor 3 but just 2 members (a gap at index 2).
%struct.patic = type { float, float, float }

; Source loop:
; for (int i = 0; i < num; i++) {
;   ps[i].x = factor * ps[i].x;
;   ps[i].y = factor * ps[i].y;
; }
;
define void @shufflePadding(i32 noundef %num, ptr nocapture noundef %ps) {
; PADDING-LABEL: 'shufflePadding'
; PADDING: LV: Found an estimated cost of 3 for VF 16 For instruction: store float %mul6, ptr %y, align 4

; NO-PADDING-LABEL: 'shufflePadding'
; NO-PADDING: LV: Found an estimated cost of 188 for VF 16 For instruction: store float %mul6, ptr %y, align 4
entry:
  %cmp19 = icmp sgt i32 %num, 0
  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %num to i64
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; &ps[i] — also the address of the .x field (struct offset 0).
  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %mul = fmul fast float %0, 0x40019999A0000000
  store float %mul, ptr %arrayidx, align 4
  ; &ps[i].y (field index 1); .z is never touched, creating the group gap.
  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
  %1 = load float, ptr %y, align 4
  %mul6 = fmul fast float %1, 0x40019999A0000000
  store float %mul6, ptr %y, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}

0 commit comments

Comments
 (0)