Commit 3a6a226

[LV][EVL] Generate negative strided load/store for reversed load/store
This removes the extra vp.reverse operations previously needed to reverse the mask, the loaded result, and the value being stored.

1 parent: d9f165d
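For a reversed load, the change collapses the previous three-step sequence (reverse the mask, vp.load, reverse the loaded result) into a single strided load whose stride is the negated element size in bytes. A minimal before/after sketch distilled from the test updates below; %p, %mask, and %evl are placeholder names, and nxv4i32 is chosen to match the tests:

Before:
  %rev.mask = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %wide.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %p, <vscale x 4 x i1> %rev.mask, i32 %evl)
  %res = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)

After:
  ; stride i32 -4 = -(32 / 8): lane i is read from %p + i * -4, and %mask is used unreversed
  %res = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 %p, i32 -4, <vscale x 4 x i1> %mask, i32 %evl)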

3 files changed: +46 -46 lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 38 additions & 28 deletions
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif

-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);

@@ -2630,8 +2619,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   }
@@ -2641,17 +2628,29 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
     Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                             nullptr, "wide.masked.gather");
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      Value *Operands[] = {
+          Addr,
+          ConstantInt::getSigned(
+              Builder.getInt32Ty(),
+              -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+          Mask, EVL};
+      NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                      {DataTy, PtrTy, Builder.getInt32Ty()},
+                                      Operands, nullptr, "vp.neg.strided.load");
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Load, DataTy, Addr, "vp.op.load"));
+    }
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   State.addMetadata(NewLI, LI);
   Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
   State.set(this, Res);
 }

@@ -2749,13 +2748,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   }
@@ -2765,11 +2760,26 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    if (isReverse()) {
+      Type *StoredValTy = StoredVal->getType();
+      auto *EltTy = cast<VectorType>(StoredValTy)->getElementType();
+      auto *PtrTy = Addr->getType();
+      Value *Operands[] = {
+          StoredVal, Addr,
+          ConstantInt::getSigned(
+              Builder.getInt32Ty(),
+              -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+          Mask, EVL};
+      NewSI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_store,
+          {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands);
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Store, Type::getVoidTy(EVL->getContext()),
+          {StoredVal, Addr}));
+    }
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
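The store path is rewritten symmetrically: the stored value and the mask feed llvm.experimental.vp.strided.store directly, with the same negated element-size stride. A corresponding sketch distilled from the test updates below, again with %v, %p, and %evl as placeholders:

Before:
  %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %rev, ptr align 4 %p, <vscale x 4 x i1> splat (i1 true), i32 %evl)

After:
  ; %v is stored as-is; lane i goes to %p + i * -4
  call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> %v, ptr align 4 %p, i32 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)

Note how the stride tracks the element width in the tests: -1 for i8, -4 for i32, and -8 for i64.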

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll

Lines changed: 7 additions & 16 deletions
@@ -39,16 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -153,18 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -280,8 +274,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
@@ -290,16 +283,14 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll

Lines changed: 1 addition & 2 deletions
@@ -43,8 +43,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP15]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP20]], i32 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
