Skip to content

Commit 746caae

Browse files
committed
[Fix] New operand Stride for VPVectorPointerRecipe
1 parent adee371 commit 746caae

File tree

8 files changed

+44
-42
lines changed

8 files changed

+44
-42
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7756,10 +7756,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
77567756
VectorPtr = new VPVectorEndPointerRecipe(
77577757
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
77587758
} else {
7759-
VectorPtr = new VPVectorPointerRecipe(
7760-
Ptr, getLoadStoreType(I), /*Strided*/ false,
7761-
GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
7762-
I->getDebugLoc());
7759+
const DataLayout &DL = I->getDataLayout();
7760+
auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
7761+
VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
7762+
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
7763+
GEP ? GEP->getNoWrapFlags()
7764+
: GEPNoWrapFlags::none(),
7765+
I->getDebugLoc());
77637766
}
77647767
Builder.insert(VectorPtr);
77657768
Ptr = VectorPtr;

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,24 +1745,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
17451745
};
17461746

17471747
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
1748-
/// Supports both consecutive and reverse consecutive accesses.
1749-
/// TODO: Support non-unit strided accesses.
17501748
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
1751-
public VPUnrollPartAccessor<1> {
1749+
public VPUnrollPartAccessor<2> {
17521750
Type *IndexedTy;
17531751

1754-
/// Indicate whether to compute the pointer for strided memory accesses.
1755-
bool Strided;
1756-
17571752
public:
1758-
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
1753+
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride,
17591754
GEPNoWrapFlags GEPFlags, DebugLoc DL)
1760-
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
1761-
GEPFlags, DL),
1762-
IndexedTy(IndexedTy), Strided(Strided) {}
1755+
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
1756+
ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
1757+
IndexedTy(IndexedTy) {}
17631758

17641759
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
17651760

1761+
VPValue *getStride() const { return getOperand(1); }
1762+
17661763
void execute(VPTransformState &State) override;
17671764

17681765
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -1780,7 +1777,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
17801777
}
17811778

17821779
VPVectorPointerRecipe *clone() override {
1783-
return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
1780+
return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getOperand(1),
17841781
getGEPNoWrapFlags(), getDebugLoc());
17851782
}
17861783

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2371,16 +2371,19 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
23712371
void VPVectorPointerRecipe::execute(VPTransformState &State) {
23722372
auto &Builder = State.Builder;
23732373
unsigned CurrentPart = getUnrollPart(*this);
2374-
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2375-
CurrentPart, Builder);
2374+
Value *Stride = State.get(getStride(), /*IsScalar*/ true);
2375+
bool IsStrideOne =
2376+
isa<ConstantInt>(Stride) && cast<ConstantInt>(Stride)->isOne();
2377+
// TODO: can use an i32 index type if the stride is minus one and the
2378+
// current part is zero.
2379+
Type *IndexTy = IsStrideOne
2380+
? getGEPIndexTy(State.VF.isScalable(),
2381+
/*IsReverse*/ false, CurrentPart, Builder)
2382+
: Stride->getType();
23762383
Value *Ptr = State.get(getOperand(0), VPLane(0));
23772384

23782385
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2379-
// TODO: Support non-unit-reverse strided accesses.
2380-
Value *Index =
2381-
Strided
2382-
? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
2383-
: Increment;
2386+
Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
23842387
Value *ResultPtr =
23852388
Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
23862389

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2575,22 +2575,21 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
25752575
// The stride of consecutive reverse access must be -1.
25762576
int64_t Stride = -1;
25772577
auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
2578+
const DataLayout &DL = Ingredient.getDataLayout();
2579+
auto *StrideTy = DL.getIndexType(PtrUV->getType());
2580+
VPValue *StrideVPV =
2581+
Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride));
25782582
// Create a new vector pointer for strided access.
2579-
auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true,
2583+
auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, StrideVPV,
25802584
GEP ? GEP->getNoWrapFlags()
25812585
: GEPNoWrapFlags::none(),
25822586
VecEndPtr->getDebugLoc());
25832587
NewPtr->insertBefore(MemR);
25842588

25852589
auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
2586-
auto *LI = cast<LoadInst>(&Ingredient);
2587-
const DataLayout &DL = LI->getDataLayout();
2588-
auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
2589-
VPValue *StrideVPV =
2590-
Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride));
25912590
auto *StridedLoad = new VPWidenStridedLoadRecipe(
2592-
*LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
2593-
LoadR->getDebugLoc());
2591+
*cast<LoadInst>(&Ingredient), NewPtr, StrideVPV, &Plan.getVF(),
2592+
LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
25942593
StridedLoad->insertBefore(LoadR);
25952594
LoadR->replaceAllUsesWith(StridedLoad);
25962595

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
3939
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
4040
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
4141
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
42-
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
42+
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 0
4343
; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
4444
; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
4545
; RV64-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
@@ -149,7 +149,7 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
149149
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
150150
; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
151151
; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
152-
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
152+
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 0
153153
; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
154154
; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
155155
; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1
@@ -235,7 +235,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
235235
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
236236
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
237237
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
238-
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
238+
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 0
239239
; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
240240
; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
241241
; RV64-NEXT: [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
@@ -345,7 +345,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
345345
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
346346
; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
347347
; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
348-
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
348+
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 0
349349
; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
350350
; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
351351
; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
7777
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
7878
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
7979
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
80-
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
80+
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
8181
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
8282
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
8383
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -201,7 +201,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
201201
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
202202
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
203203
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
204-
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
204+
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
205205
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
206206
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
207207
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -325,7 +325,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
325325
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
326326
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
327327
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
328-
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
328+
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
329329
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
330330
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
331331
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -449,7 +449,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
449449
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
450450
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
451451
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
452-
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
452+
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
453453
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
454454
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
455455
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
3131
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
3232
; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
3333
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]]
34-
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
34+
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0
3535
; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
3636
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
3737
; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64
@@ -129,7 +129,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
129129
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
130130
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
131131
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]]
132-
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
132+
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 0
133133
; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> [[TMP9]], i32 [[TMP5]])
134134
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
135135
; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64

llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
4242
; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
4343
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" +
4444
; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
45-
; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" +
45+
; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" +
4646
; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
4747
; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" +
4848
; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
49-
; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" +
49+
; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" +
5050
; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
5151
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
5252
; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +

0 commit comments

Comments
 (0)