Skip to content

Commit e1ea86e

Browse files
committed
[SLP] Do not try to use interleaved loads if reordering is required
If the interleaved loads require reordering, it is better to avoid generating a load + shuffle sequence, which in this case cannot be recognized as an interleaved load. This change also fixes an issue with incorrect codegen. Fixes #138923
1 parent 09b772e commit e1ea86e

File tree

2 files changed

+40
-39
lines changed

2 files changed

+40
-39
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5976,10 +5976,15 @@ static bool isMaskedLoadCompress(
59765976
TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
59775977
LI->getPointerAddressSpace(), CostKind);
59785978
}
5979-
if (IsStrided && !IsMasked) {
5979+
if (IsStrided && !IsMasked && Order.empty()) {
59805980
// Check for potential segmented(interleaved) loads.
5981-
auto *AlignedLoadVecTy = getWidenedType(
5981+
VectorType *AlignedLoadVecTy = getWidenedType(
59825982
ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
5983+
if (!isSafeToLoadUnconditionally(
5984+
Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
5985+
cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
5986+
&DT, &TLI))
5987+
AlignedLoadVecTy = LoadVecTy;
59835988
if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
59845989
CommonAlignment,
59855990
LI->getPointerAddressSpace())) {
@@ -18226,15 +18231,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1822618231
if (E->State == TreeEntry::Vectorize) {
1822718232
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
1822818233
} else if (E->State == TreeEntry::CompressVectorize) {
18229-
SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
18230-
if (!E->ReorderIndices.empty()) {
18231-
SmallVector<int> Mask(E->ReorderIndices.begin(),
18232-
E->ReorderIndices.end());
18233-
reorderScalars(Scalars, Mask);
18234-
}
18235-
SmallVector<Value *> PointerOps(Scalars.size());
18236-
for (auto [I, V] : enumerate(Scalars))
18237-
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
1823818234
auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
1823918235
CompressEntryToData.at(E);
1824018236
Align CommonAlignment = LI->getAlign();

llvm/test/Transforms/SLPVectorizer/RISCV/reordered-interleaved-loads.ll

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,48 +10,53 @@ define i1 @test(i32 %conv15.12, i16 %0, ptr %1, i16 %2, i16 %3, i16 %4, i16 %5,
1010
; CHECK-NEXT: [[ENTRY:.*:]]
1111
; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP1]], align 2
1212
; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr @h, align 2
13-
; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i16>, ptr getelementptr inbounds nuw (i8, ptr @h, i64 6), align 2
14-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
15-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> poison, i16 [[TMP6]], i32 0
16-
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP11]], i16 [[TMP5]], i32 1
17-
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP4]], i32 2
18-
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP3]], i32 3
19-
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP2]], i32 4
20-
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP0]], i32 6
21-
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP8]], i32 5
22-
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP7]], i32 7
23-
; CHECK-NEXT: [[TMP19:%.*]] = sext <8 x i16> [[TMP18]] to <8 x i32>
24-
; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP19]], <8 x i32> zeroinitializer)
25-
; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP18]], <4 x i16> [[TMP10]], i64 0)
26-
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <8 x i16> [[TMP21]], zeroinitializer
27-
; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i32> splat (i32 8), <8 x i32> [[TMP19]]
28-
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[CONV15_1_4]], i32 0
29-
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[CONV15_1_3]], i32 1
30-
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[CONV15_12]], i32 7
31-
; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i32> [[TMP23]], [[TMP26]]
32-
; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt <8 x i32> [[TMP20]], [[TMP27]]
33-
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i1> [[TMP28]], i32 7
13+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
14+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP0]], i32 2
15+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[TMP8]], i32 1
16+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[TMP7]], i32 3
17+
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP12]] to <4 x i32>
18+
; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP13]], <4 x i32> zeroinitializer)
19+
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <4 x i16> [[TMP12]], zeroinitializer
20+
; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> splat (i32 8), <4 x i32> [[TMP13]]
21+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CONV15_12]], i32 3
22+
; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[TMP16]], [[TMP17]]
23+
; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i32> [[TMP14]], [[TMP18]]
24+
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3
3425
; CHECK-NEXT: [[CONV30_18:%.*]] = zext i1 [[TMP29]] to i16
3526
; CHECK-NEXT: store i16 [[CONV30_18]], ptr @a, align 2
36-
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP28]], i32 6
27+
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2
3728
; CHECK-NEXT: [[CONV30_219:%.*]] = zext i1 [[TMP30]] to i16
3829
; CHECK-NEXT: store i16 [[CONV30_219]], ptr @a, align 2
39-
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i1> [[TMP28]], i32 5
30+
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1
4031
; CHECK-NEXT: [[CONV30_330:%.*]] = zext i1 [[TMP31]] to i16
4132
; CHECK-NEXT: store i16 [[CONV30_330]], ptr @a, align 2
42-
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP28]], i32 4
33+
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0
4334
; CHECK-NEXT: [[CONV30_4:%.*]] = zext i1 [[TMP32]] to i16
4435
; CHECK-NEXT: store i16 [[CONV30_4]], ptr @a, align 2
45-
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i1> [[TMP28]], i32 3
36+
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> poison, i16 [[TMP3]], i32 0
37+
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP4]], i32 1
38+
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> [[TMP25]], i16 [[TMP5]], i32 2
39+
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP6]], i32 3
40+
; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[TMP27]] to <4 x i32>
41+
; CHECK-NEXT: [[TMP38:%.*]] = load <16 x i16>, ptr getelementptr inbounds nuw (i8, ptr @h, i64 6), align 2
42+
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP38]], <16 x i16> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
43+
; CHECK-NEXT: [[TMP40:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP28]], <4 x i32> zeroinitializer)
44+
; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <4 x i16> [[TMP39]], zeroinitializer
45+
; CHECK-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP41]], <4 x i32> splat (i32 8), <4 x i32> [[TMP28]]
46+
; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[CONV15_1_3]], i32 2
47+
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[CONV15_1_4]], i32 3
48+
; CHECK-NEXT: [[TMP45:%.*]] = xor <4 x i32> [[TMP42]], [[TMP44]]
49+
; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt <4 x i32> [[TMP40]], [[TMP45]]
50+
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP37]], i32 0
4651
; CHECK-NEXT: [[CONV30_1_1:%.*]] = zext i1 [[TMP33]] to i16
4752
; CHECK-NEXT: store i16 [[CONV30_1_1]], ptr @a, align 2
48-
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP28]], i32 2
53+
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP37]], i32 1
4954
; CHECK-NEXT: [[CONV30_1_2:%.*]] = zext i1 [[TMP34]] to i16
5055
; CHECK-NEXT: store i16 [[CONV30_1_2]], ptr @a, align 2
51-
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP28]], i32 1
56+
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP37]], i32 2
5257
; CHECK-NEXT: [[CONV30_1_3:%.*]] = zext i1 [[TMP35]] to i16
5358
; CHECK-NEXT: store i16 [[CONV30_1_3]], ptr @a, align 2
54-
; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i1> [[TMP28]], i32 0
59+
; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP37]], i32 3
5560
; CHECK-NEXT: ret i1 [[TMP36]]
5661
;
5762
entry:

0 commit comments

Comments
 (0)