[LoopVectorize] Enable shuffle padding for masked interleaved accesses #75329

Open · wants to merge 1 commit into main
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/VectorUtils.h
@@ -819,6 +819,13 @@ class InterleavedAccessInfo {
  /// Returns true if we have any interleave groups.
  bool hasGroups() const { return !InterleaveGroups.empty(); }

  /// Check if the interleaved store group has a matching load group, meaning
  /// the store must satisfy the following restrictions:
  /// 1. The value operand of the StoreInst comes from a LoadInst.
  /// 2. The store group and the load group access the same memory.
  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
                                     Value *Ptr) const;

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
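For orientation (not part of the patch): a minimal source-level sketch of a store that satisfies both restrictions, assuming a three-field struct whose last field is never written. All names below are illustrative.

// Illustrative C++: ps[i].z is the gap member of both the load and the
// store interleave groups; the stored values derive from loads of ps.
struct Patic { float x, y, z; };

void scale(Patic *ps, int num, float factor) {
  for (int i = 0; i < num; ++i) {
    ps[i].x = factor * ps[i].x; // value operand comes from a LoadInst
    ps[i].y = factor * ps[i].y; // both groups access the same memory (ps)
  }
}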
23 changes: 23 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
@@ -1441,6 +1441,29 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
  RequiresScalarEpilogue = false;
}

Value *InterleavedAccessInfo::hasMatchedLoadGroupForStore(Instruction *Inst,
                                                          BasicBlock *BB,
                                                          Value *Ptr) const {
  if (isa<PHINode>(Inst) || Inst->getParent() != BB)
    return nullptr;

  // A load that belongs to an interleave group over the same underlying
  // object is the match we are looking for.
  if (isa<LoadInst>(Inst)) {
    Value *V = getUnderlyingObject(Inst->getOperand(0));
    auto Group = getInterleaveGroup(Inst);
    if (Group && (V == Ptr))
      return Group->getInsertPos();
  }

  // Otherwise, recurse through the operands feeding the store's value chain.
  for (unsigned It = 0; It < Inst->getNumOperands(); It++) {
    if (Instruction *I = dyn_cast<Instruction>(Inst->getOperand(It)))
      if (Value *MatchedLoadGroupEntry =
              hasMatchedLoadGroupForStore(I, BB, Ptr))
        return MatchedLoadGroupEntry;
  }

  return nullptr;
}

template <typename InstT>
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
  llvm_unreachable("addMetadata can only be used for Instruction");
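A hypothetical caller-side sketch of the new query (the variables IAI and SI are assumed here, not part of the patch); the cost-model wrapper added below uses the same call shape:

// Given InterleavedAccessInfo &IAI and a candidate StoreInst *SI:
Value *Obj = getUnderlyingObject(SI->getPointerOperand());
if (Value *LoadGroupEntry =
        IAI.hasMatchedLoadGroupForStore(SI, SI->getParent(), Obj)) {
  // LoadGroupEntry is the insert position of the load group that feeds
  // SI's value operand; its strided shuffles can pad the store group's gaps.
}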
72 changes: 67 additions & 5 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

static cl::opt<bool> EnableShufflePadding(
    "enable-shuffle-padding", cl::init(true), cl::Hidden,
    cl::desc("Enable shuffle padding to generate structured stores."));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;

  /// This map stores the shuffles used to pad the gaps of an interleaved
  /// store group. The key is the insert position of the load group that is
  /// matched to the related store group.
  MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
                                     Value *Ptr) const {
    return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
  }

private:
  unsigned NumPredStores = 0;

@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
                       : ShuffledMask;
  };

  Value *MatchedLoad = nullptr;
  bool IsShufflePadding = false;
  if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
      TTI->enableScalableVectorization()) {
    IsShufflePadding = true;
    if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
      MatchedLoad = Cost->hasMatchedLoadGroupForStore(
          Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      SmallVector<Value *, 4> Shuffles;
      // Skip the gaps in the group if there is no padding.
      if (!Member && !IsShufflePadding)
        continue;

      auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // For a gap, still materialize the strided shuffle so it can later
        // pad the matching store group.
        if (!Member) {
          if (Group->isReverse())
            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
          Shuffles.push_back(StridedVec);
          continue;
        }
        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@
        if (Group->isReverse())
          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");

        Shuffles.push_back(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      PaddedShufflesMap[Instr].push_back(Shuffles);
      if (Member)
        ++J;
    }
    return;
  }
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      if (!Member && MatchedLoad) {
        // %wide.vec = load <12 x float>                        ; 0,1,2,...,11
        // %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; 0,3,6,9
        // %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
        // %padded   = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11
        //
        // %concate1 = shuffle %shuffle1, %shuffle2, <0, 1, ..., 7>
        //             ; 0,3,6,9,1,4,7,10
        // %concate2 = shuffle %padded, poison,
        //             <0, 1, ..., 3, undef, undef, undef, undef>
        //             ; 2,5,8,11,poison,...,poison
        // %concateFinal = shuffle %concate1, %concate2,
        //                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
        //                 ; 0,1,2,...,11
        // store <12 x float> %concateFinal
        Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
        StoredVecs.push_back(PaddedShuffle);
        continue;
      }

Review comment (Contributor): should op1 be shuffle1 and op2 be shuffle2?
Reply (Author): My apologies... Thanks for the reminder!

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
    // Interleave all the smaller vectors into one wider vector.
    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
    Instruction *NewStoreInstr;
    if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
    if (Group->getMember(IF))
      Indices.push_back(IF);

  bool IsShufflePaddingStore = false;
  if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
      TTI.enableScalableVectorization() && !VF.isScalable())
    IsShufflePaddingStore = true;

  // Calculate the cost of the whole interleaved group.
  // If shuffle padding is enabled, ignore gaps.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
       (!IsShufflePaddingStore ||
        !hasMatchedLoadGroupForStore(I, I->getParent(),
                                     getUnderlyingObject(I->getOperand(1)))));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
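To condense the cost-model gating above into one place, a sketch with two helper names (HasGaps, CanPadGaps) introduced purely for illustration:

// A store group with gaps is costed as a masked access only when the gaps
// cannot be padded from a matched load group.
bool HasGaps = Group->getNumMembers() < Group->getFactor();
bool CanPadGaps =
    IsShufflePaddingStore &&
    hasMatchedLoadGroupForStore(I, I->getParent(),
                                getUnderlyingObject(I->getOperand(1)));
bool UseMaskForGaps =
    (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
    (isa<StoreInst>(I) && HasGaps && !CanPadGaps);

With padding available, the test below expects a cost of 3 instead of 188 for the VF-16 store.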
@@ -0,0 +1,46 @@
; REQUIRES: asserts
; RUN: opt -enable-shuffle-padding=true -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=PADDING
; RUN: opt -enable-shuffle-padding=false -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=NO-PADDING

%struct.patic = type { float, float, float }

; for (int i = 0; i < num; i++) {
; ps[i].x = factor * ps[i].x;
; ps[i].y = factor * ps[i].y;
; }
;
define void @shufflePadding(i32 noundef %num, ptr nocapture noundef %ps) {
; PADDING-LABEL: 'shufflePadding'
; PADDING: LV: Found an estimated cost of 3 for VF 16 For instruction: store float %mul6, ptr %y, align 4

; NO-PADDING-LABEL: 'shufflePadding'
; NO-PADDING: LV: Found an estimated cost of 188 for VF 16 For instruction: store float %mul6, ptr %y, align 4
entry:
  %cmp19 = icmp sgt i32 %num, 0
  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %num to i64
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %mul = fmul fast float %0, 0x40019999A0000000
  store float %mul, ptr %arrayidx, align 4
  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
  %1 = load float, ptr %y, align 4
  %mul6 = fmul fast float %1, 0x40019999A0000000
  store float %mul6, ptr %y, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
