[LV][EVL] Support reversed loads/stores. #88025

Merged

Changes from 4 commits
48 changes: 30 additions & 18 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1571,13 +1571,7 @@ class LoopVectorizationCostModel {
/// Returns true if VP intrinsics with explicit vector length support should
/// be generated in the tail folded loop.
bool foldTailWithEVL() const {
return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
// FIXME: remove this once vp_reverse is supported.
none_of(
WideningDecisions,
[](const std::pair<std::pair<Instruction *, ElementCount>,
std::pair<InstWidening, InstructionCost>>
&Data) { return Data.second.first == CM_Widen_Reverse; });
return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
}

/// Returns true if the Phi is part of an inloop reduction.
@@ -9367,9 +9361,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse loading after vp_reverse is added.
assert(!isReverse() && "Reverse loads are not implemented yet.");

auto *LI = cast<LoadInst>(&Ingredient);

Type *ScalarDataTy = getLoadStoreType(&Ingredient);
@@ -9382,9 +9373,19 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
CallInst *NewLI;
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
Value *Addr = State.get(getAddr(), 0, !CreateGather);
Value *Mask =
getMask() ? State.get(getMask(), 0)
: Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *Mask = getMask()
? State.get(getMask(), 0)
: Builder.CreateVectorSplat(State.VF, Builder.getTrue());
if (isReverse() && getMask()) {
VectorType *MaskTy = cast<VectorType>(Mask->getType());
Contributor:
From looking at the test changes, it seems like this patch may not be covered?

Contributor:
The newly added test only seems to cover stores with masks, could you add one with loads that need a mask as well?
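As a sketch of the kind of coverage being requested (hypothetical, not the test added by this patch; function and variable names are made up for illustration): a loop with a decreasing induction variable whose load sits under a guard, so the widened reverse load needs a real mask rather than an all-true splat. In C:

void conditional_reverse_load(int *dst, const int *src, const int *cond, int n) {
  // Decreasing induction variable -> consecutive but reversed memory accesses.
  for (int i = n - 1; i >= 0; --i)
    // This guard becomes the mask of the widened reverse load (and store).
    if (cond[i] != 0)
      dst[i] = src[i] + 1;
}

In practice the pointers would also need noalias/restrict guarantees for the loop to vectorize; the point is only that the guard forces a non-trivial mask on the reversed accesses.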

Mask = Builder.CreateIntrinsic(
MaskTy, Intrinsic::experimental_vp_reverse,
{Mask,
Builder.CreateVectorSplat(MaskTy->getElementCount(),
Builder.getTrue()),
EVL},
nullptr, "vp.reverse.mask");
}
if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
@@ -9398,7 +9399,14 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
State.addMetadata(NewLI, LI);
State.set(this, NewLI, 0);
Instruction *Res = NewLI;
if (isReverse()) {
Value *MaskVal =
Builder.CreateVectorSplat(DataTy->getElementCount(), Builder.getTrue());
Res = Builder.CreateIntrinsic(DataTy, Intrinsic::experimental_vp_reverse,
{Res, MaskVal, EVL}, nullptr, "vp.reverse");
}
State.set(this, Res, 0);
}

void VPWidenStoreRecipe::execute(VPTransformState &State) {
@@ -9444,9 +9452,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse loading after vp_reverse is added.
assert(!isReverse() && "Reverse store are not implemented yet.");

auto *SI = cast<StoreInst>(&Ingredient);

VPValue *StoredValue = getStoredValue();
@@ -9459,7 +9464,14 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, 0);
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
// FIXME: Support reverse store after vp_reverse is added.
if (isReverse()) {
auto *StoredValTy = cast<VectorType>(StoredVal->getType());
Value *MaskVal = Builder.CreateVectorSplat(StoredValTy->getElementCount(),
Builder.getTrue());
StoredVal = Builder.CreateIntrinsic(
StoredValTy, Intrinsic::experimental_vp_reverse,
{StoredVal, MaskVal, EVL}, nullptr, "vp.reverse");
}
Value *Mask =
getMask() ? State.get(getMask(), 0)
: Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
7 changes: 4 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2399,8 +2399,8 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
{L->getAddr(), EVL}, L->isConsecutive(), false,
L->getDebugLoc()),
{L->getAddr(), EVL}, L->isConsecutive(),
L->isReverse(), L->getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2476,7 +2476,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
{S->getAddr(), S->getStoredValue(), EVL},
S->isConsecutive(), false, S->getDebugLoc()) {
S->isConsecutive(), S->isReverse(),
S->getDebugLoc()) {
setMask(Mask);
}

2 changes: 0 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1341,8 +1341,6 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
if (!MemR)
continue;
assert(!MemR->isReverse() &&
"Reversed memory operations not supported yet.");
VPValue *OrigMask = MemR->getMask();
assert(OrigMask && "Unmasked widen memory recipe when folding tail");
VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
@@ -30,14 +30,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], -1
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
@@ -46,19 +43,19 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
; IF-EVL-NEXT: [[TMP31:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
; IF-EVL-NEXT: [[TMP28:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP28]], ptr align 4 [[TMP25]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
; IF-EVL-NEXT: [[TMP29:%.*]] = zext i32 [[TMP8]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP29]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]