
[LV] Support strided load with a stride of -1 #128718


Open
Mel-Chen wants to merge 17 commits into main from cm-stride

Conversation

Mel-Chen
Contributor

@Mel-Chen Mel-Chen commented Feb 25, 2025

Initial patch to support strided memory accesses. Currently only a stride of -1 is supported; future extensions will add support for additional strides and strided stores.
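For concreteness, the kind of loop this targets is a consecutive reverse access, i.e. one whose addresses decrease by one element per iteration, such as the loop in the new RISC-V test added below (the loop body here is a placeholder, since the test body is truncated in this view):

  // Consecutive reverse access: a[i] and b[i] are both visited with a
  // stride of -1 elements per iteration.
  #pragma clang loop vectorize_width(4, scalable)
  for (int i = N - 1; i >= 0; --i)
    a[i] = b[i]; // placeholder body; any element-wise use of b[i] works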

@llvmbot
Member

llvmbot commented Feb 25, 2025

@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-backend-risc-v

@llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes

Patch is 118.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128718.diff

9 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+45-2)
  • (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+19-10)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+91-22)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+2-2)
  • (added) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll (+642)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll (+93-125)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll (+17-180)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll (+2-50)
  • (modified) llvm/unittests/Transforms/Vectorize/VPlanTest.cpp (+4-3)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..6635a91563384 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
 
+  /// Returns true if the target machine can represent \p V as a strided load
+  /// or store operation.
+  bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+    if (!isa<LoadInst, StoreInst>(V))
+      return false;
+    auto *Ty = getLoadStoreType(V);
+    Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
+    return TTI.isLegalStridedLoadStore(Ty, Align);
+  }
+
   /// Returns true if the target machine supports all of the reduction
   /// variables found for the given VF.
   bool canVectorizeReductions(ElementCount VF) const {
@@ -1645,6 +1658,9 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5813,6 +5829,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
@@ -6132,6 +6161,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // Consider using strided load/store for consecutive reverse accesses to
+        // achieve more efficient memory operations.
+        if (ConsecutiveStride == -1) {
+          const InstructionCost StridedLoadStoreCost =
+              isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+                                              : InstructionCost::getInvalid();
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
@@ -6777,6 +6817,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
         return TTI::CastContextHint::Normal;
 
       switch (getWideningDecision(I, VF)) {
+      // TODO: New CastContextHint for strided accesses.
+      case LoopVectorizationCostModel::CM_Strided:
       case LoopVectorizationCostModel::CM_GatherScatter:
         return TTI::CastContextHint::GatherScatter;
       case LoopVectorizationCostModel::CM_Interleave:
@@ -8335,6 +8377,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -8362,11 +8405,11 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 I->getDebugLoc());
+                                 Strided, I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, I->getDebugLoc());
+                                Reverse, Strided, I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..34b4c2c8a0be1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2611,6 +2611,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   /// Whether the consecutive accessed addresses are in reverse order.
   bool Reverse;
 
+  /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+  bool Strided;
+
   /// Whether the memory access is masked.
   bool IsMasked = false;
 
@@ -2624,9 +2627,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
 
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse, DebugLoc DL)
+                      bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
-        Reverse(Reverse) {
+        Reverse(Reverse), Strided(Strided) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
   }
 
@@ -2654,6 +2657,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   /// order.
   bool isReverse() const { return Reverse; }
 
+  /// Return whether the accessed addresses are evenly spaced apart by a fixed
+  /// stride.
+  bool isStrided() const { return Strided; }
+
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const { return getOperand(0); }
 
@@ -2683,16 +2690,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
 /// optional mask.
 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse, DebugLoc DL)
+                    bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, DL),
+                            Reverse, Strided, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse,
+                                 getMask(), Consecutive, Reverse, Strided,
                                  getDebugLoc());
   }
 
@@ -2724,7 +2731,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
                             {L.getAddr(), &EVL}, L.isConsecutive(),
-                            L.isReverse(), L.getDebugLoc()),
+                            L.isReverse(), L.isStrided(), L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2761,16 +2768,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
-                     VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+                     VPValue *Mask, bool Consecutive, bool Reverse,
+                     bool Strided, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, DL) {
+                            Consecutive, Reverse, Strided, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, getDebugLoc());
+                                  Reverse, Strided, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2804,7 +2812,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {S.getAddr(), S.getStoredValue(), &EVL},
-                            S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+                            S.isConsecutive(), S.isReverse(), S.isStrided(),
+                            S.getDebugLoc()) {
     setMask(Mask);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..e62ae690caa6d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2609,10 +2609,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, Ctx.CostKind,
-                                          &Ingredient);
+    if (Strided)
+      return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                            IsMasked, Alignment, Ctx.CostKind,
+                                            &Ingredient);
+    else
+      return Ctx.TTI.getAddressComputationCost(Ty) +
+             Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                            IsMasked, Alignment, Ctx.CostKind,
+                                            &Ingredient);
   }
 
   InstructionCost Cost = 0;
@@ -2639,11 +2644,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  bool CreateGather = !isConsecutive();
+  bool CreateGather = !isConsecutive() && !isStrided();
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  Value *Mask = nullptr;
+  Value *Mask = isStrided()
+                    ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+                    : nullptr;
   if (auto *VPMask = getMask()) {
     // Mask reversal is only needed for non-all-one (null) masks, as reverse
     // of a null all-one mask is a null mask.
@@ -2658,9 +2665,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                        "wide.masked.gather");
   } else if (Mask) {
-    NewLI =
-        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                 PoisonValue::get(DataTy), "wide.masked.load");
+    if (isStrided()) {
+      const DataLayout &DL = LI->getDataLayout();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+      Value *RuntimeVF =
+          getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+      NewLI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+          {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+      cast<CallInst>(NewLI)->addParamAttr(
+          0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+    } else {
+      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                       PoisonValue::get(DataTy),
+                                       "wide.masked.load");
+    }
   } else {
     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
   }
@@ -2698,7 +2721,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  bool CreateGather = !isConsecutive();
+  bool CreateGather = !isConsecutive() && !isStrided();
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
@@ -2718,6 +2741,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
     NewLI =
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
+  } else if (isStrided()) {
+    const DataLayout &DL = LI->getDataLayout();
+    auto *PtrTy = Addr->getType();
+    auto *StrideTy = DL.getIndexType(PtrTy);
+    // TODO: Support non-unit-reverse strided accesses.
+    auto *StrideVal =
+        ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+    NewLI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+        {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
   } else {
     VectorBuilder VBuilder(Builder);
     VBuilder.setEVL(EVL).setMask(Mask);
@@ -2772,13 +2805,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredVPValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
+  bool CreateScatter = !isConsecutive() && !isStrided();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
 
-  Value *Mask = nullptr;
+  Value *Mask = isStrided()
+                    ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+                    : nullptr;
   if (auto *VPMask = getMask()) {
     // Mask reversal is only needed for non-all-one (null) masks, as reverse
     // of a null all-one mask is a null mask.
@@ -2797,12 +2832,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   }
   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
   Instruction *NewSI = nullptr;
-  if (CreateScatter)
+  if (CreateScatter) {
     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
-  else if (Mask)
-    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
-  else
+  } else if (Mask) {
+    if (isStrided()) {
+      const DataLayout &DL = SI->getDataLayout();
+      auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+      Type *StoredEltTy = StoredVecTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+      Value *RuntimeVF =
+          getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+      NewSI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_store,
+          {StoredVecTy, PtrTy, StrideTy},
+          {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+      cast<CallInst>(NewSI)->addParamAttr(
+          1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+    } else {
+      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    }
+  } else {
     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+  }
   State.addMetadata(NewSI, SI);
 }
 
@@ -2818,7 +2873,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
+  bool CreateScatter = !isConsecutive() && !isStrided();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
@@ -2843,11 +2898,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    if (isStrided()) {
+      const DataLayout &DL = SI->getDataLayout();
+      auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+      Type *StoredEltTy = StoredVecTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+      NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                      {StoredVecTy, PtrTy, StrideTy},
+                                      {StoredVal, Addr, StrideVal, Mask, EVL});
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Store, Type::getVoidTy(EVL->getContext()),
+          {StoredVal, Addr}));
+    }
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c917e4eef655..fd6984208a760 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -73,13 +73,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
         if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
           NewRecipe = new VPWidenLoadRecipe(
               *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-              false /*Consecutive*/, false /*Reverse*/,
+              false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
               Ingredient.getDebugLoc());
         } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
           NewRecipe = new VPWidenStoreRecipe(
               *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
               nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-              Ingredient.getDebugLoc());
+              false /*Strided*/, Ingredient.getDebugLoc());
         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
           NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..1aa01d7aabee9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the loop in c++ being vectorize in this file with
+;; vector.reverse
+;;  #pragma clang loop vectorize_width(4, scalable)
+;;  for (int i = N-1; i >= 0; --i)
+;;    a[i] = b[i] ...
[truncated]

@Mel-Chen Mel-Chen changed the title [LV] Support strided memory accesses with a stride of -1 WIP:[LV] Support strided memory accesses with a stride of -1 Feb 25, 2025
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}

/// Returns true if the target machine can represent \p V as a strided load
/// or store operation.
bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
Member

I think it would be better to match the format of similar functions, such as isLegalGatherOrScatter, isLegalMaskedLoad, etc.

Contributor Author

I copied the function name from the TTI hook. Do you have any recommended names?

Member

It is not about the name, more about the function parameters.

Contributor Author

The prototype of isLegalGatherOrScatter is:

bool isLegalGatherOrScatter(Value *V, ElementCount VF)

And the prototype of isLegalMaskedLoad is:

bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment)

isLegalStridedLoadStore is the same as isLegalGatherOrScatter now.
Do you prefer the prototype of isLegalMaskedLoad?


github-actions bot commented Mar 7, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@Mel-Chen Mel-Chen force-pushed the cm-stride branch 3 times, most recently from 1966fd5 to c54369e on March 10, 2025 08:27
@Mel-Chen Mel-Chen changed the title WIP:[LV] Support strided memory accesses with a stride of -1 [LV] Support strided memory accesses with a stride of -1 Mar 10, 2025
@Mel-Chen
Contributor Author

This patch is ready for review. Please take a look, thanks.

@Mel-Chen Mel-Chen marked this pull request as ready for review March 10, 2025 09:20
@@ -1312,6 +1313,22 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}

/// Returns true if the target machine can represent \p V as a strided load
/// or store operation.
bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
Member

I'm suggesting to use the prototype of isLegalMaskedLoad here; most of these functions do the analysis based on type/alignment info.

Contributor Author

After consideration, I have decided to rename isLegalStridedLoadStore to stridedAccessCanBeWidened and align its prototype with memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
The reason is that, unlike isLegalGatherOrScatter and isLegalMaskedLoad, which primarily check whether the target supports certain operations, strided memory access requires additional checks, such as checking the stride. Its functionality is closer to memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
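As a rough sketch of that direction (illustration only; it assumes the prototype mirrors memoryInstructionCanBeWidened and reuses the legality check shown above, and the final form is whatever lands in the next revision):

  /// Returns true if the strided memory access \p I can be widened for \p VF.
  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const {
    // Sketch: only loads and stores are candidates; a real version would also
    // verify the stride itself (currently only -1 is handled).
    if (!isa<LoadInst, StoreInst>(I))
      return false;
    Type *Ty = getLoadStoreType(I);
    Align Alignment = getLoadStoreAlignment(I);
    if (VF.isVector())
      Ty = VectorType::get(Ty, VF);
    return TTI.isLegalStridedLoadStore(Ty, Alignment);
  }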

// TODO: Support non-unit-reverse strided accesses.
Value *Index =
Strided
? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
Contributor

Strided means negative stride? It would be good to clarify that in the comment for Strided.

Member

Just for now. Later it is supposed to support positive (and < -1 negative) and runtime stride values.

Contributor

Should Strided eventually be some sort of std::optional<int>? Or is the plan to also allow runtime strided values? In which case I presume we need to model it as a VPValue operand.

Contributor Author

This is the initial patch for strided memory accesses, currently only supporting cases where the stride is -1. A follow-up patch will focus on stride analysis to support more strided memory accesses.

However, I believe it would be beneficial to discuss and decide how VPlan should handle stride in this initial patch. My current plan is to extend the existing VPWidenLoadRecipe/VPWidenStoreRecipe and VPVectorPointerRecipe by explicitly passing the stride into the recipe. For example:

VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, 
                  VPValue *Stride,  
                  bool Consecutive, bool Reverse, DebugLoc DL)

A memory access will only be considered strided when Stride is explicitly provided (i.e., not nullptr).
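For illustration, a hypothetical sketch of the creation side in tryToWidenMemory under this proposal (the StrideTy choice and the use of Plan.getOrAddLiveIn are my assumptions, not code from this PR):

  // Hypothetical: materialize the stride as a live-in VPValue only when the
  // cost model chose CM_Strided; a nullptr Stride keeps today's behaviour.
  VPValue *Stride = nullptr;
  if (Decision == LoopVectorizationCostModel::CM_Strided) {
    Type *StrideTy = Legal->getWidestInductionType(); // assumed stride type
    Stride = Plan.getOrAddLiveIn(ConstantInt::getSigned(StrideTy, -1));
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Stride, Consecutive,
                                 Reverse, I->getDebugLoc());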

@fhahn @ayalz What do you think about this approach?

Contributor Author

Ping @fhahn @ayalz. Do you think this is a good approach, or do you have any suggestions?

Contributor

@Mel-Chen yes, that's a good idea, since it will generate a VP intrinsic either way.

Contributor

It is a bit confusing that VPVectorPointerRecipe w/o stride steps by positive increment and with Strided by negative increment. Should it be VectorPointerEndRecipe?

Contributor Author

Let me explain why we need a vector-pointer here instead of a vector-end-pointer.

In the current implementation of reverse loads (CM_Widen_Reverse), the vectorizer emits a regular load followed by a reverse.
For example, to load VF elements in reverse starting from index n (say VF = 4), we want to access [n, n-1, n-2, n-3].
With load + reverse, we actually load [n-3, n-2, n-1, n], and then reverse the vector:

%L  = load [n-3, n-2, n-1, n]
%RL = reverse %L

So in this case, the vector-end-pointer must point to element n-3.

However, when using a strided load with a negative stride to implement reverse loads, it behaves more like a regular consecutive load in terms of pointer computation:

%RL = strided.load n, -1

That’s why I chose to extend vector-pointer rather than rely on vector-end-pointer.
That said, I now think it would be better to explicitly pass in the stride to vector-pointer, rather than using a bool Strided flag.

Alternatively, would you prefer introducing a new pointer recipe specifically for strided cases where the stride is not 1?
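To spell out the difference in plain code, here is an illustration only (not code from the patch): scalar models of the two lowering strategies for a VF-element reverse access ending at element n.

  #include <algorithm>
  #include <cstddef>

  // Strategy 1: consecutive load from the group's lowest address plus a
  // vector reverse; this needs a pointer to the end of the widened access,
  // which is what the vector-end-pointer computes.
  void reverseViaConsecutiveLoad(const int *a, std::size_t n, std::size_t VF,
                                 int *out) {
    const int *Base = a + n - (VF - 1); // vector-end pointer -> element n-VF+1
    std::copy(Base, Base + VF, out);    // wide consecutive load
    std::reverse(out, out + VF);        // vector reverse
  }

  // Strategy 2: strided load with a stride of -1; this only needs a pointer
  // to the first element of each part, i.e. a plain vector-pointer.
  void reverseViaStridedLoad(const int *a, std::size_t n, std::size_t VF,
                             int *out) {
    const int *Base = a + n;            // vector pointer -> element n
    for (std::size_t I = 0; I != VF; ++I)
      out[I] = Base[-static_cast<std::ptrdiff_t>(I)]; // step back one element
  }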

Contributor Author

@fhahn Do you think we should introduce a new pointer recipe, or is it better to just extend the existing vector-pointer recipe? I prefer the latter.

Contributor

I think it should be fine to add a stride argument for now, with a brief comment documenting the behavior.

lukel97 added a commit to lukel97/llvm-project that referenced this pull request Mar 13, 2025
…pe. NFC

After llvm#128718 lands there will be two ways of performing a reversed widened memory access, either by performing a consecutive unit-stride access and a reverse, or a strided access with a negative stride.

Even though both produce a reversed vector, only the former needs VPReverseVectorPointerRecipe which computes a pointer to the last element of the widened vector. A strided reverse still needs a pointer to the first element of each part so it will use VPVectorPointerRecipe.

This renames VPReverseVectorPointerRecipe to VPVectorEndPointerRecipe to clarify that a reversed access may not necessarily need a pointer to the last element.
@@ -1720,16 +1723,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
/// Supports both consecutive and reverse consecutive accesses.
Contributor

Is this still accurate, given that VPVectorEndPointerRecipe also handles reversed accesses?

class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;

/// Indicate whether to compute the pointer for strided memory accesses.
Contributor

Should document what the stride is.

const InstructionCost StridedLoadStoreCost =
Ctx.TTI.getStridedMemoryOpCost(
Ingredient.getOpcode(), DataTy,
getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
Contributor

Would be good to avoid passing the pointer operand from the ingredient here, as it may not be accurate any longer. What does the interface use the pointer operand for? Ideally we would pass the required info directly w/o having to pass an IR value. Do you think that would be feasible?

Otherwise, can we pass the underlying value from the pointer operand, if it exists?

Contributor Author

getStridedMemoryOpCost falls back to getGatherScatterOpCost when the target doesn't support strided access.

  InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I) const override {
    // For a target without strided memory operations (or for an illegal
    // operation type on one which does), assume we lower to a gather/scatter
    // operation.  (Which may in turn be scalarized.)
    return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                           Alignment, CostKind, I);
  }

Since getGatherScatterOpCost uses the pointer operand, I went ahead with the latter approach you suggested.
8875c32
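For reference, a sketch of what that looks like on the VPlan side (my paraphrase of the approach; the actual change is in the commit linked above, and the MemR/DataTy/Alignment names follow the cost-model snippet quoted earlier):

  // Pass the underlying IR value of the address VPValue, instead of reading
  // the pointer operand off the original ingredient. getUnderlyingValue() may
  // be null when no IR value is attached, in which case no pointer is passed
  // (see the discussion below about targets relying on the BasicTTI default).
  const Value *PtrUV = MemR->getAddr()->getUnderlyingValue();
  const InstructionCost StridedLoadStoreCost = Ctx.TTI.getStridedMemoryOpCost(
      Ingredient.getOpcode(), DataTy, PtrUV, MemR->isMasked(), Alignment,
      Ctx.CostKind, &Ingredient);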

@Mel-Chen
Contributor Author

@fhahn Ping, I extended VPVectorPointerRecipe to achieve the goal for now. If you think we should introduce a new pointer recipe instead, please let me know.
746caae

@Mel-Chen Mel-Chen requested a review from fhahn June 12, 2025 09:16
@@ -1743,18 +1746,20 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
Contributor

Would be good to document the arguments

// The stride of consecutive reverse access must be -1.
int64_t Stride = -1;
auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
Contributor

Is stripping the pointer casts needed here? If it is just needed for the flags, can we get the wrap flags from the recipe?

Contributor Author

We can't directly pass the GEPNoWrapFlags from VPVectorEndPointerRecipe here. Tail folding may cause VPVectorEndPointerRecipe to compute addresses that the original scalar loop wouldn't, which results in the removal of the 'inbounds' flag.

        // When folding the tail, we may compute an address that we don't in the
        // original scalar loop and it may not be inbounds. Drop Inbounds in that
        // case.
        GEPNoWrapFlags Flags =
            (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
                ? GEPNoWrapFlags::none()
                : GEPNoWrapFlags::inBounds();
        VectorPtr = new VPVectorEndPointerRecipe(
            Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());

However, this doesn't apply to VPVectorPointerRecipe — if the original IR had the 'inbounds' flag, it should be preserved.

auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
// TODO: support strided store
// TODO: support strided accesses with stride not equal to -1
if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
Contributor

Do we need to check if MemR is consecutive here?

Contributor Author

I think we don't have to check whether MemR is consecutive. There is an assertion in the constructor of VPWidenMemoryRecipe:

assert((Consecutive || !Reverse) && "Reverse implies consecutive");

So I added an assertion for consecutive instead.
72c17a1


auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
VPValue *Ptr = VecEndPtr->getPtr();
Value *PtrUV = Ptr->getUnderlyingValue();
Contributor

Ah I see, the Ptr argument isn't used by the RISCV TTI, but is potentially used by others; it appears passing nullptr here would work, but then it may cause issues with TTI implementations that use the default from BasicTTI?

Contributor Author

Yes, RISC-V TTI can omit passing the pointer because it implements getStridedMemoryOpCost and does not require full pointer information, so passing nullptr could work. However, this might cause issues for targets that do not implement getStridedMemoryOpCost.

@Mel-Chen
Contributor Author

ping

@fhahn
Contributor

fhahn commented Jun 23, 2025

Do you know if/how this is related/interacts with #130032?

@Mel-Chen
Contributor Author

Do you know if/how this is related/interacts with #130032?

Honestly, I don't quite understand how consecutive load/store can directly replace reverse accesses. As you mentioned, #130032 lacks a description.

@LiqinWeng
Contributor

Do you know if/how this is related/interacts with #130032?

Sorry, I'll add a description.
