Commit 7f74651

[VPlan] Use pointer to member 0 as VPInterleaveRecipe's pointer arg. (#106431)
Update VPInterleaveRecipe to always use the pointer to member 0 as its pointer argument. In many cases this removes unneeded index adjustments and simplifies VPInterleaveRecipe::execute.

In some rare cases, the address of member 0 does not dominate the insert position of the interleave group. In those cases, a PtrAdd VPInstruction is emitted to compute the address of member 0 from the address at the insert position. Alternatively, we could hoist the recipe that computes the address of member 0.
1 parent b0c070e commit 7f74651

18 files changed: +157 −237 lines changed
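
For orientation, here is the shape of the change, adapted from the explanatory comment this patch removes from VPInterleaveRecipe::execute. A minimal sketch in plain C++ (the factor-2 group and the helper function are illustrative, not part of the patch):

    #include <cstdint>

    // A factor-2 interleave group over i32:
    //   a = A[i];     // member 0
    //   b = A[i+1];   // member 1 (assume this is the group's insert position)
    //
    // Before: the recipe carried &A[i+1] and execute() emitted a GEP with
    // index -1 to step back to member 0. After: the recipe carries &A[i]
    // directly; only when &A[i] does not dominate the insert position is a
    // PtrAdd emitted, with its byte offset computed as sketched here.
    int64_t member0ByteOffset(unsigned InsertPosIndex, unsigned ElemSizeBytes) {
      return -(static_cast<int64_t>(InsertPosIndex) * ElemSizeBytes); // e.g. -4
    }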

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 8 additions & 2 deletions

@@ -220,9 +220,15 @@ class VPBuilder {
         new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
   }
 
-  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL,
+  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
                               const Twine &Name = "") {
-    return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+  }
+  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+                                const Twine &Name = "") {
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
   }
 
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
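
Both helpers wrap the same PtrAdd VPInstruction and differ only in the GEPFlagsTy value they attach. A minimal usage sketch mirroring the call site this patch adds in VPlanTransforms.cpp (BaseAddr, OffsetVPV, InBounds, and InsertPos are assumed to be in scope; this is a fragment, not a standalone program):

    // Emit a PtrAdd recipe before InsertPos, choosing the in-bounds variant
    // when the original IR GEP carried the inbounds flag.
    VPBuilder B(InsertPos);
    VPValue *Adjusted = InBounds ? B.createInBoundsPtrAdd(BaseAddr, OffsetVPV)
                                 : B.createPtrAdd(BaseAddr, OffsetVPV);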

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 2 deletions

@@ -9070,8 +9070,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
-  VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
-                                          CM.isScalarEpilogueAllowed());
+  VPlanTransforms::createInterleaveGroups(
+      *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
 
   for (ElementCount VF : Range)
     Plan->addVF(VF);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 1 deletion

@@ -957,7 +957,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
     DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
   };
 
-protected:
   struct GEPFlagsTy {
     char IsInBounds : 1;
     GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
@@ -1308,6 +1307,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
     assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
   }
 
+  VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,
+                DebugLoc DL = {}, const Twine &Name = "")
+      : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
+                            ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
+        Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
+
   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                 FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 12 additions & 25 deletions

@@ -641,7 +641,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
            "can only generate first lane for PtrAdd");
     Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
     Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
-    return Builder.CreatePtrAdd(Ptr, Addend, Name);
+    return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
+                        : Builder.CreatePtrAdd(Ptr, Addend, Name);
   }
   case VPInstruction::ResumePhi: {
     Value *IncomingFromVPlanPred =
@@ -2605,51 +2606,37 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   unsigned InterleaveFactor = Group->getFactor();
   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
 
-  // Prepare for the new pointers.
-  unsigned Index = Group->getIndex(Instr);
-
   // TODO: extend the masked interleaved-group support to reversed access.
   VPValue *BlockInMask = getMask();
   assert((!BlockInMask || !Group->isReverse()) &&
          "Reversed masked interleave-group not supported.");
 
-  Value *Idx;
+  Value *Index;
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
   // pointer operand of the interleaved access is supposed to be uniform.
   if (Group->isReverse()) {
     Value *RuntimeVF =
         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
-    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
-    Idx = State.Builder.CreateMul(Idx,
-                                  State.Builder.getInt32(Group->getFactor()));
-    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
-    Idx = State.Builder.CreateNeg(Idx);
-  } else
-    Idx = State.Builder.getInt32(-Index);
+    Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+    Index = State.Builder.CreateMul(Index,
+                                    State.Builder.getInt32(Group->getFactor()));
+    Index = State.Builder.CreateNeg(Index);
+  } else {
+    // TODO: Drop redundant 0-index GEP as follow-up.
+    Index = State.Builder.getInt32(0);
+  }
 
   VPValue *Addr = getAddr();
   Value *ResAddr = State.get(Addr, VPLane(0));
   if (auto *I = dyn_cast<Instruction>(ResAddr))
     State.setDebugLocFrom(I->getDebugLoc());
 
-  // Notice current instruction could be any index. Need to adjust the address
-  // to the member of index 0.
-  //
-  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-  //       b = A[i];       // Member of index 0
-  // Current pointer is pointed to A[i+1], adjust it to A[i].
-  //
-  // E.g.  A[i+1] = a;     // Member of index 1
-  //       A[i]   = b;     // Member of index 0
-  //       A[i+2] = c;     // Member of index 2 (Current instruction)
-  // Current pointer is pointed to A[i+2], adjust it to A[i].
-
   bool InBounds = false;
   if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
     InBounds = gep->isInBounds();
-  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
+  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
 
   State.setDebugLocFrom(Instr->getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);
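
Note the reverse-group index no longer folds in the member index, since the address already points at member 0. A self-contained sketch of the index computation in plain C++ (the function name is illustrative; the real code emits IRBuilder instructions):

    #include <cstdint>

    // Element index fed to the single GEP in VPInterleaveRecipe::execute:
    // forward groups start at member 0 (index 0); reverse groups step back
    // to the group's position in the last vector lane.
    int64_t interleaveGepIndex(bool IsReverse, uint64_t RuntimeVF,
                               uint64_t Factor) {
      if (!IsReverse)
        return 0; // TODO in the patch: drop this redundant 0-index GEP
      return -static_cast<int64_t>((RuntimeVF - 1) * Factor);
    }
    // E.g. VF = 4, factor 2, reversed: index = -(3 * 2) = -6 elements.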

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 46 additions & 6 deletions

@@ -1590,14 +1590,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
 }
 
 void VPlanTransforms::createInterleaveGroups(
-    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+    VPlan &Plan,
+    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+        &InterleaveGroups,
     VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
+  if (InterleaveGroups.empty())
+    return;
+
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
+  VPDominatorTree VPDT;
+  VPDT.recalculate(Plan);
   for (const auto *IG : InterleaveGroups) {
-    auto *Recipe =
-        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
     SmallVector<VPValue *, 4> StoredValues;
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1607,9 +1612,44 @@ void VPlanTransforms::createInterleaveGroups(
 
     bool NeedsMaskForGaps =
         IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
-    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
-                                        Recipe->getMask(), NeedsMaskForGaps);
-    VPIG->insertBefore(Recipe);
+
+    Instruction *IRInsertPos = IG->getInsertPos();
+    auto *InsertPos =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+
+    // Get or create the start address for the interleave group.
+    auto *Start =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
+    VPValue *Addr = Start->getAddr();
+    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
+    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
+      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
+      // InsertPos or sink loads above zero members to join it.
+      bool InBounds = false;
+      if (auto *Gep = dyn_cast<GetElementPtrInst>(
+              getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+        InBounds = Gep->isInBounds();
+
+      // We cannot re-use the address of member zero because it does not
+      // dominate the insert position. Instead, use the address of the insert
+      // position and create a PtrAdd adjusting it to the address of member
+      // zero.
+      assert(IG->getIndex(IRInsertPos) != 0 &&
+             "index of insert position shouldn't be zero");
+      APInt Offset(32,
                   getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
                       IG->getIndex(IRInsertPos),
                   /*IsSigned=*/true);
+      VPValue *OffsetVPV = Plan.getOrAddLiveIn(
+          ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+      VPBuilder B(InsertPos);
+      Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
+                      : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
+    }
+    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
+                                        InsertPos->getMask(), NeedsMaskForGaps);
+    VPIG->insertBefore(InsertPos);
+
     unsigned J = 0;
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (Instruction *Member = IG->getMember(i)) {
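
Concretely, the live-in offset is the insert position's member index times the element size in bytes, negated so the PtrAdd steps back to member 0. A short worked sketch with LLVM's APInt, under the patch's 32-bit signed-offset assumption (the element size and index here are illustrative):

    #include "llvm/ADT/APInt.h"

    // An i32 group (4 bytes per element) whose insert position is member 2:
    // Offset = 4 * 2 = 8, so the emitted PtrAdd adds -8 bytes.
    llvm::APInt Offset(/*numBits=*/32, /*val=*/4 * 2, /*isSigned=*/true);
    llvm::APInt Negated = -Offset; // steps the address back to member 0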

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 1 deletion

@@ -114,7 +114,9 @@ struct VPlanTransforms {
   // widening its memory instructions with a single VPInterleaveRecipe at its
   // insertion point.
   static void createInterleaveGroups(
-      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+      VPlan &Plan,
+      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+          &InterleaveGroups,
       VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
 
   /// Remove dead recipes from \p Plan.

llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll

Lines changed: 1 addition & 2 deletions

@@ -14,8 +14,7 @@ define void @interleaved_store_first_order_recurrence(ptr noalias %src, ptr %dst
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 11 additions & 19 deletions

@@ -41,13 +41,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,20 +119,19 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[TMP7]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -404,10 +401,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
-; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
@@ -715,16 +712,14 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1271,14 +1266,11 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
+; CHECK-NEXT:    [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
