Skip to content

Commit dc58274

Browse files
author
zhangtiehu
committed
[LoopVectorize] Enable shuffle padding for masked interleaved accesses
typedef struct { float x; float y; float z; } patic; for (int i = 0; i < num; i++) { ps[i].x = factor * ps[i].x; ps[i].y = factor * ps[i].y; } This patch pads the gap of the interleave store group to eliminate the masked.store, which helps generate better code in the Interleaved Access pass, as shown: %wide.vec = load <12 x float>; 0,1,2,3,...,11 %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9 %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10 %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11 %concate1 = shuffle %op1, %op2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10 %concate2 = shuffle %padded, poison, <0, 1, ..., 3, undef, undef, undef, undef> ; 2,5,8,11,poison,...,poison %concateFinal = shuffle %concate1, %concate2, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11 store <12 x float> %concateFinal This patch adds some restrictions for shuffle padding; that is, a target interleave store group must have a matched interleave load group, which means: 1. The value operand of the StoreInst should come from a LoadInst. 2. The store group and the load group access the same struct memory.
1 parent e34c35a commit dc58274

File tree

5 files changed

+1364
-5
lines changed

5 files changed

+1364
-5
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,13 @@ class InterleavedAccessInfo {
819819
/// Returns true if we have any interleave groups.
820820
bool hasGroups() const { return !InterleaveGroups.empty(); }
821821

822+
/// Check if the interleaved store group has a matched load group, meaning
/// the store satisfies the following restrictions:
/// 1. The value operand of the StoreInst comes from a LoadInst.
/// 2. The store group and the load group access the same memory.
826+
Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
827+
Value *Ptr) const;
828+
822829
private:
823830
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
824831
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,6 +1441,29 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
14411441
RequiresScalarEpilogue = false;
14421442
}
14431443

1444+
/// Walk the use-def chain feeding a store to find an interleaved load group
/// that reads the same underlying object the store writes.
///
/// \param Inst the root of the use-def chain to search (typically the store,
///        or the value operand feeding it).
/// \param BB   the block containing the store; the chain must stay inside it.
/// \param Ptr  the underlying object of the store's pointer operand.
/// \returns the insert position of the matched load group, or nullptr if no
///          such group exists.
Value *InterleavedAccessInfo::hasMatchedLoadGroupForStore(Instruction *Inst,
                                                          BasicBlock *BB,
                                                          Value *Ptr) const {
  // Iterative pre-order DFS over the use-def chain. The visited set prevents
  // re-walking shared sub-expressions: without it, a DAG-shaped chain (e.g.
  // repeated x = a + a) makes the naive recursion revisit the same
  // instruction an exponential number of times.
  std::vector<Instruction *> Worklist;
  std::unordered_set<const Instruction *> Visited;
  Worklist.push_back(Inst);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.back();
    Worklist.pop_back();

    // Do not look through PHIs or leave the store's block: the matched load
    // must execute in the same block (and thus the same vector iteration).
    if (isa<PHINode>(Cur) || Cur->getParent() != BB)
      continue;
    if (!Visited.insert(Cur).second)
      continue;

    if (isa<LoadInst>(Cur)) {
      // A load matches when it belongs to an interleave group and reads the
      // same underlying object the store writes to.
      Value *V = getUnderlyingObject(Cur->getOperand(0));
      auto Group = getInterleaveGroup(Cur);
      if (Group && V == Ptr)
        return Group->getInsertPos();
    }

    // Push operands in reverse so they are explored left-to-right, matching
    // the operand order of the original recursive traversal.
    for (unsigned It = Cur->getNumOperands(); It > 0; --It)
      if (Instruction *Op = dyn_cast<Instruction>(Cur->getOperand(It - 1)))
        Worklist.push_back(Op);
  }

  return nullptr;
}
1466+
14441467
template <typename InstT>
14451468
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
14461469
llvm_unreachable("addMetadata can only be used for Instruction");

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
408408
// after prolog. See `emitIterationCountCheck`.
409409
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410410

411+
static cl::opt<bool> EnableShufflePadding(
412+
"enable-shuffle-padding", cl::init(true), cl::Hidden,
413+
cl::desc("Enable shuffle padding to generate structure store."));
414+
411415
/// A helper function that returns true if the given type is irregular. The
412416
/// type is irregular if its allocated size doesn't equal the store size of an
413417
/// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
796800
// correct start value of reduction PHIs when vectorizing the epilogue.
797801
SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
798802
ReductionResumeValues;
803+
804+
/// The map stores shuffles which are used to pad the gap of the interleaved
805+
/// store groups. The key for the map is the entry of the load group who is
806+
/// matched to the related store group.
807+
MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
799808
};
800809

801810
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
17021711
/// \p VF is the vectorization factor chosen for the original loop.
17031712
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
17041713

1714+
Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
1715+
Value *Ptr) const {
1716+
return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
1717+
}
1718+
17051719
private:
17061720
unsigned NumPredStores = 0;
17071721

@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
25572571
: ShuffledMask;
25582572
};
25592573

2574+
Value *MatchedLoad = nullptr;
2575+
bool IsShufflePadding = false;
2576+
if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
2577+
TTI->enableScalableVectorization()) {
2578+
IsShufflePadding = true;
2579+
if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
2580+
MatchedLoad = Cost->hasMatchedLoadGroupForStore(
2581+
Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
2582+
}
2583+
25602584
// Vectorize the interleaved load group.
25612585
if (isa<LoadInst>(Instr)) {
25622586
Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26262650
for (unsigned I = 0; I < InterleaveFactor; ++I) {
26272651
Instruction *Member = Group->getMember(I);
26282652

2629-
// Skip the gaps in the group.
2630-
if (!Member)
2653+
SmallVector<Value *, 4> Shuffles;
2654+
// Skip the gaps in the group if there are no paddings.
2655+
if (!Member && !IsShufflePadding)
26312656
continue;
26322657

26332658
auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26362661
Value *StridedVec = Builder.CreateShuffleVector(
26372662
NewLoads[Part], StrideMask, "strided.vec");
26382663

2664+
if (!Member) {
2665+
if (Group->isReverse())
2666+
StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2667+
Shuffles.push_back(StridedVec);
2668+
continue;
2669+
}
26392670
// If this member has different type, cast the result type.
26402671
if (Member->getType() != ScalarTy) {
26412672
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26462677
if (Group->isReverse())
26472678
StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
26482679

2680+
Shuffles.push_back(StridedVec);
2681+
26492682
State.set(VPDefs[J], StridedVec, Part);
26502683
}
2651-
++J;
2684+
PaddedShufflesMap[Instr].push_back(Shuffles);
2685+
if (Member)
2686+
++J;
26522687
}
26532688
return;
26542689
}
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26722707
"Fail to get a member from an interleaved store group");
26732708
Instruction *Member = Group->getMember(i);
26742709

2710+
if (!Member && MatchedLoad) {
2711+
// %wide.vec = load <12 x float>; 0,1,2,3,...,11
2712+
// %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9
2713+
// %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
2714+
// %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11
2715+
//
2716+
// %concate1 = shuffle %op1, %op2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10
2717+
// %concate2 = shuffle %padded, poison,
2718+
// <0, 1, ..., 3, undef, undef, undef, undef>
2719+
// ; 2,5,8,11,poison,...,poison
2720+
// %concateFinal = shuffle %concate1, %concate2,
2721+
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
2722+
// store <12 x float> %concateFinal
2723+
Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
2724+
StoredVecs.push_back(PaddedShuffle);
2725+
continue;
2726+
}
2727+
26752728
// Skip the gaps in the group.
26762729
if (!Member) {
26772730
Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
26962749
// Interleave all the smaller vectors into one wider vector.
26972750
Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
26982751
Instruction *NewStoreInstr;
2699-
if (BlockInMask || MaskForGaps) {
2752+
if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
27002753
Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
27012754
NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
27022755
Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
63256378
if (Group->getMember(IF))
63266379
Indices.push_back(IF);
63276380

6381+
bool IsShufflePaddingStore = false;
6382+
if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
6383+
TTI.enableScalableVectorization() && !VF.isScalable())
6384+
IsShufflePaddingStore = true;
6385+
63286386
// Calculate the cost of the whole interleaved group.
6387+
// If shuffle padding is enabled, ignore gaps.
63296388
bool UseMaskForGaps =
63306389
(Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6331-
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6390+
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
6391+
(!IsShufflePaddingStore ||
6392+
!hasMatchedLoadGroupForStore(I, I->getParent(),
6393+
getUnderlyingObject(I->getOperand(1)))));
63326394
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
63336395
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
63346396
AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
; Test that shuffle padding lets the cost model treat an interleaved store
; group with a gap (the unaccessed .z field) as an unmasked structure store
; when a matched interleaved load group exists, instead of costing an
; expensive masked store (compare the PADDING vs NO-PADDING costs below).
; REQUIRES: asserts
; RUN: opt -enable-shuffle-padding=true -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=PADDING
; RUN: opt -enable-shuffle-padding=false -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1 | FileCheck %s --check-prefixes=NO-PADDING

; Three-float struct; the loop below reads and writes only .x and .y, so the
; interleave groups have factor 3 but just 2 members (a gap at index 2).
%struct.patic = type { float, float, float }

; Source loop:
; for (int i = 0; i < num; i++) {
;   ps[i].x = factor * ps[i].x;
;   ps[i].y = factor * ps[i].y;
; }
;
define void @shufflePadding(i32 noundef %num, ptr nocapture noundef %ps) {
; PADDING-LABEL: 'shufflePadding'
; PADDING: LV: Found an estimated cost of 3 for VF 16 For instruction: store float %mul6, ptr %y, align 4

; NO-PADDING-LABEL: 'shufflePadding'
; NO-PADDING: LV: Found an estimated cost of 188 for VF 16 For instruction: store float %mul6, ptr %y, align 4
entry:
  %cmp19 = icmp sgt i32 %num, 0
  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %num to i64
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; &ps[i] — also the address of the .x field (struct offset 0).
  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %mul = fmul fast float %0, 0x40019999A0000000
  store float %mul, ptr %arrayidx, align 4
  ; &ps[i].y (field index 1); .z is never touched, creating the group gap.
  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
  %1 = load float, ptr %y, align 4
  %mul6 = fmul fast float %1, 0x40019999A0000000
  store float %mul6, ptr %y, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}

0 commit comments

Comments
 (0)