Skip to content

Commit 3fd2b8d

Browse files
committed
!fixup address comments, thanks!
1 parent 7755ba9 commit 3fd2b8d

File tree

3 files changed

+189
-55
lines changed

3 files changed

+189
-55
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,10 +2234,13 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
22342234
}
22352235

22362236
static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
2237-
if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe()))
2238-
return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
2237+
auto *DefR = V->getDefiningRecipe();
2238+
if (!DefR)
2239+
return false;
2240+
if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
2241+
return !W->getMask() && is_contained(R0->operands(), V);
22392242

2240-
if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe()))
2243+
if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
22412244
return IR->getInterleaveGroup()->getFactor() ==
22422245
IR->getInterleaveGroup()->getNumMembers() &&
22432246
IR->getVPValue(Idx) == V;
@@ -2246,13 +2249,12 @@ static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
22462249

22472250
/// Returns true if \p InterleaveR is a full interleave group with factor and
22482251
/// number of members both equal to \p VF.
2249-
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
2250-
ElementCount VF) {
2251-
if (!IR)
2252+
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
2253+
unsigned VF) {
2254+
if (!InterleaveR)
22522255
return false;
2253-
auto IG = IR->getInterleaveGroup();
2254-
return IG->getFactor() == IG->getNumMembers() &&
2255-
IG->getNumMembers() == VF.getFixedValue();
2256+
auto IG = InterleaveR->getInterleaveGroup();
2257+
return IG->getFactor() == VF && IG->getNumMembers() == VF;
22562258
}
22572259

22582260
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
@@ -2261,6 +2263,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
22612263
if (VF.isScalable() || !VectorLoop)
22622264
return;
22632265

2266+
unsigned FixedVF = VF.getFixedValue();
22642267
SmallVector<VPInterleaveRecipe *> StoreGroups;
22652268
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
22662269
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
@@ -2274,33 +2277,50 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
22742277
if (R.isPhi())
22752278
return;
22762279

2277-
auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
2278-
if (R.mayWriteToMemory() && !IR)
2280+
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
2281+
if (R.mayWriteToMemory() && !InterleaveR)
22792282
return;
22802283

2281-
if (!IR)
2284+
if (!InterleaveR)
22822285
continue;
22832286

2284-
if (!isConsecutiveInterleaveGroup(IR, VF))
2287+
// Bail out on non-consecutive interleave groups.
2288+
if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF))
22852289
return;
2286-
if (IR->getStoredValues().empty())
2290+
2291+
// Skip read interleave groups.
2292+
if (InterleaveR->getStoredValues().empty())
2293+
continue;
2294+
2295+
if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
2296+
VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
2297+
if (!DefR)
2298+
return false;
2299+
auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
2300+
return IR &&
2301+
IR->getInterleaveGroup()->getFactor() ==
2302+
IR->getInterleaveGroup()->getNumMembers() &&
2303+
IR->getVPValue(Op.index()) == Op.value();
2304+
})) {
2305+
StoreGroups.push_back(InterleaveR);
22872306
continue;
2307+
}
22882308

22892309
auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
2290-
IR->getStoredValues()[0]->getDefiningRecipe());
2310+
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
22912311
if (!Lane0)
22922312
return;
2293-
for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
2313+
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
22942314
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
2295-
if (!R || R->getOpcode() != Lane0->getOpcode())
2315+
if (!R || R->getOpcode() != Lane0->getOpcode() || R->getNumOperands() > 2)
22962316
return;
22972317
if (any_of(R->operands(), [Lane0, Idx = I](VPValue *V) {
22982318
return !supportedLoad(Lane0, V, Idx);
22992319
}))
23002320
return;
23012321
}
23022322

2303-
StoreGroups.push_back(IR);
2323+
StoreGroups.push_back(InterleaveR);
23042324
}
23052325

23062326
if (StoreGroups.empty())
@@ -2330,15 +2350,20 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
23302350

23312351
// Narrow operation tree rooted at store groups.
23322352
for (auto *StoreGroup : StoreGroups) {
2333-
auto *Lane0 = cast<VPWidenRecipe>(
2334-
StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2335-
2336-
Lane0->setOperand(0, Narrow(Lane0->getOperand(0)->getDefiningRecipe()));
2337-
Lane0->setOperand(1, Narrow(Lane0->getOperand(1)->getDefiningRecipe()));
2353+
VPValue *Res = nullptr;
2354+
if (auto *Lane0 = dyn_cast<VPWidenRecipe>(
2355+
StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
2356+
for (unsigned Idx = 0, E = Lane0->getNumOperands(); Idx != E; ++Idx)
2357+
Lane0->setOperand(Idx,
2358+
Narrow(Lane0->getOperand(Idx)->getDefiningRecipe()));
2359+
Res = Lane0;
2360+
} else {
2361+
Res = Narrow(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2362+
}
23382363

23392364
auto *S = new VPWidenStoreRecipe(
23402365
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
2341-
StoreGroup->getAddr(), Lane0, nullptr, /*Consecutive=*/true,
2366+
StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
23422367
/*Reverse=*/false, StoreGroup->getDebugLoc());
23432368
S->insertBefore(StoreGroup);
23442369
StoreGroup->eraseFromParent();

llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,10 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
1616
; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
1717
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
1818
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
19-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
20-
; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
21-
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
22-
; VF2-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
19+
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
2320
; VF2-NEXT: [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
24-
; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
25-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
26-
; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
27-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
21+
; VF2-NEXT: store <2 x double> [[TMP4]], ptr [[TMP2]], align 8
22+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
2823
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
2924
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3025
; VF2: [[MIDDLE_BLOCK]]:
@@ -256,18 +251,15 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
256251
; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
257252
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
258253
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
259-
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
254+
; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
255+
; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
256+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
260257
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
261258
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
262-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
263-
; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
264-
; VF2-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
265-
; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
259+
; VF2-NEXT: [[TMP23:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
266260
; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
267-
; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
268-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
269-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
270-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
261+
; VF2-NEXT: store <2 x i64> [[TMP24]], ptr [[TMP7]], align 8
262+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
271263
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
272264
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
273265
; VF2: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll

Lines changed: 131 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,9 @@ define void @load_store_interleave_group(ptr noalias %data) {
1616
; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
1717
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
1818
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
19-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
20-
; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
21-
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
22-
; VF2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
23-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
24-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
25-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
19+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
20+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
21+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
2622
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
2723
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
2824
; VF2: [[MIDDLE_BLOCK]]:
@@ -120,14 +116,10 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
120116
; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
121117
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
122118
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
123-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
124-
; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
125-
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
119+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
126120
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
127-
; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
128-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
129-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
130-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
121+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
122+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
131123
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
132124
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
133125
; VF2: [[MIDDLE_BLOCK]]:
@@ -323,3 +315,128 @@ loop:
323315
exit:
324316
ret void
325317
}
318+
319+
define void @same_load_group_used_by_multiple_load_groups(ptr noalias %src, ptr noalias %A, ptr noalias %B) {
320+
; VF2-LABEL: define void @same_load_group_used_by_multiple_load_groups(
321+
; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
322+
; VF2-NEXT: [[ENTRY:.*]]:
323+
; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
324+
; VF2: [[VECTOR_PH]]:
325+
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
326+
; VF2: [[VECTOR_BODY]]:
327+
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
328+
; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
329+
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
330+
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
331+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
332+
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
333+
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
334+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
335+
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
336+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP4]], align 8
337+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
338+
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
339+
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
340+
; VF2: [[MIDDLE_BLOCK]]:
341+
; VF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
342+
; VF2: [[SCALAR_PH]]:
343+
; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
344+
; VF2-NEXT: br label %[[LOOP:.*]]
345+
; VF2: [[LOOP]]:
346+
; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
347+
; VF2-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
348+
; VF2-NEXT: [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
349+
; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
350+
; VF2-NEXT: [[A_0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[MUL_2]]
351+
; VF2-NEXT: store i64 [[L_0]], ptr [[A_0]], align 8
352+
; VF2-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
353+
; VF2-NEXT: [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
354+
; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
355+
; VF2-NEXT: [[A_1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD_1]]
356+
; VF2-NEXT: store i64 [[L_1]], ptr [[A_1]], align 8
357+
; VF2-NEXT: [[B_0:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[MUL_2]]
358+
; VF2-NEXT: store i64 [[L_0]], ptr [[B_0]], align 8
359+
; VF2-NEXT: [[B_1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD_1]]
360+
; VF2-NEXT: store i64 [[L_1]], ptr [[B_1]], align 8
361+
; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
362+
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
363+
; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
364+
; VF2: [[EXIT]]:
365+
; VF2-NEXT: ret void
366+
;
367+
; VF4-LABEL: define void @same_load_group_used_by_multiple_load_groups(
368+
; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
369+
; VF4-NEXT: [[ENTRY:.*]]:
370+
; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
371+
; VF4: [[VECTOR_PH]]:
372+
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
373+
; VF4: [[VECTOR_BODY]]:
374+
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
375+
; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
376+
; VF4-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
377+
; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
378+
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
379+
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
380+
; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
381+
; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
382+
; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
383+
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
384+
; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
385+
; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
386+
; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
387+
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
388+
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
389+
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
390+
; VF4: [[MIDDLE_BLOCK]]:
391+
; VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
392+
; VF4: [[SCALAR_PH]]:
393+
; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
394+
; VF4-NEXT: br label %[[LOOP:.*]]
395+
; VF4: [[LOOP]]:
396+
; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
397+
; VF4-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
398+
; VF4-NEXT: [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
399+
; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
400+
; VF4-NEXT: [[A_0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[MUL_2]]
401+
; VF4-NEXT: store i64 [[L_0]], ptr [[A_0]], align 8
402+
; VF4-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
403+
; VF4-NEXT: [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
404+
; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
405+
; VF4-NEXT: [[A_1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD_1]]
406+
; VF4-NEXT: store i64 [[L_1]], ptr [[A_1]], align 8
407+
; VF4-NEXT: [[B_0:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[MUL_2]]
408+
; VF4-NEXT: store i64 [[L_0]], ptr [[B_0]], align 8
409+
; VF4-NEXT: [[B_1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD_1]]
410+
; VF4-NEXT: store i64 [[L_1]], ptr [[B_1]], align 8
411+
; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
412+
; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
413+
; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
414+
; VF4: [[EXIT]]:
415+
; VF4-NEXT: ret void
416+
;
417+
entry:
418+
br label %loop
419+
420+
loop:
421+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
422+
%mul.2 = shl nsw i64 %iv, 1
423+
%src.0 = getelementptr inbounds i64, ptr %src, i64 %mul.2
424+
%l.0 = load i64, ptr %src.0, align 8
425+
%A.0 = getelementptr inbounds i64, ptr %A, i64 %mul.2
426+
store i64 %l.0, ptr %A.0, align 8
427+
%add.1 = or disjoint i64 %mul.2, 1
428+
%src.1 = getelementptr inbounds i64, ptr %src, i64 %add.1
429+
%l.1 = load i64, ptr %src.1, align 8
430+
%A.1 = getelementptr inbounds i64, ptr %A, i64 %add.1
431+
store i64 %l.1, ptr %A.1, align 8
432+
%B.0 = getelementptr inbounds i64, ptr %B, i64 %mul.2
433+
store i64 %l.0, ptr %B.0, align 8
434+
%B.1 = getelementptr inbounds i64, ptr %B, i64 %add.1
435+
store i64 %l.1, ptr %B.1, align 8
436+
%iv.next = add nuw nsw i64 %iv, 1
437+
%ec = icmp eq i64 %iv.next, 100
438+
br i1 %ec, label %exit, label %loop
439+
440+
exit:
441+
ret void
442+
}

0 commit comments

Comments
 (0)