
Commit 1fcf78d

[SLP]Cache data for compressed loads before codegen
Need to cache and use the cached data for compressed loads before codegen, to avoid side effects caused by the earlier vectorization, which may affect the analysis.
1 parent a9dff35 commit 1fcf78d
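
The fix follows a compute-once, reuse-later pattern: the compress-load analysis runs while the IR is still unmodified (during cost estimation), its result is stored in a per-entry map, and codegen only reads the cached result instead of re-running the analysis on IR that earlier vectorization may already have rewritten. Below is a minimal standalone sketch of that shape, using standard containers and hypothetical names (CompressInfo, analyzeCompress, Entry) rather than the actual SLP vectorizer API.

#include <tuple>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for the real SLP structures; illustration only.
struct Entry { unsigned Id = 0; };

// Result of the (IR-dependent) compress-load analysis:
// (compress mask, load width, interleave factor, is-masked).
using CompressInfo = std::tuple<std::vector<int>, unsigned, unsigned, bool>;

class Vectorizer {
  // Keyed by entry; filled during cost analysis, read during codegen.
  std::unordered_map<const Entry *, CompressInfo> CompressCache;

  // Placeholder analysis; must run before any IR has been rewritten.
  CompressInfo analyzeCompress(const Entry &) {
    return {{0, 2, 4, 6}, /*Width=*/8, /*InterleaveFactor=*/1, /*IsMasked=*/false};
  }

public:
  // Cost phase: analyze once and remember the result for this entry.
  void costEntry(const Entry &E) {
    CompressCache.try_emplace(&E, analyzeCompress(E));
  }

  // Codegen phase: reuse the cached result instead of re-analyzing,
  // since the IR may already have been changed by earlier vectorization.
  void codegenEntry(const Entry &E) {
    const auto &[Mask, Width, InterleaveFactor, IsMasked] = CompressCache.at(&E);
    // ... emit the wide load and shuffle using Mask/Width here ...
    (void)Mask; (void)Width; (void)InterleaveFactor; (void)IsMasked;
  }

  // Mirrors the per-tree state reset: the cache must be cleared with it.
  void reset() { CompressCache.clear(); }
};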

2 files changed: +58 −9 lines changed
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 10 additions & 9 deletions
@@ -1889,6 +1889,7 @@ class BoUpSLP {
     LoadEntriesToVectorize.clear();
     IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
+    CompressEntryToData.clear();
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
     for (auto &Iter : BlocksSchedules) {
@@ -4308,6 +4309,11 @@ class BoUpSLP {
   /// The index of the first gathered load entry in the VectorizeTree.
   std::optional<unsigned> GatheredLoadsEntriesFirst;

+  /// Maps compress entries to their mask data for the final codegen.
+  SmallDenseMap<const TreeEntry *,
+                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
+      CompressEntryToData;
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
@@ -13428,6 +13434,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
           CompressMask, LoadVecTy);
       assert(IsVectorized && "Expected to be vectorized");
+      CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
+                                      InterleaveFactor, IsMasked);
       Align CommonAlignment;
       if (IsMasked)
         CommonAlignment = computeCommonAlignment<LoadInst>(VL);
@@ -17963,10 +17971,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::CompressVectorize) {
-        bool IsMasked;
-        unsigned InterleaveFactor;
-        SmallVector<int> CompressMask;
-        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
@@ -17976,11 +17980,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(Scalars))
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
-        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
-            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
-            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
-            CompressMask, LoadVecTy);
-        assert(IsVectorized && "Expected to be vectorized");
+        auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
+            CompressEntryToData.at(E);
        Align CommonAlignment;
        if (IsMasked)
          CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
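
One detail worth keeping in mind in the codegen hunk above: the structured binding must unpack the tuple elements in exactly the order they were emplaced in getEntryCost (mask, load vector type, interleave factor, masked flag), since std::tuple gives them no names. A tiny standalone round-trip sketch of that contract, using standard containers and hypothetical names rather than the LLVM ADTs:

#include <cassert>
#include <map>
#include <tuple>
#include <vector>

int main() {
  // Mapped value mirrors the patch: (mask, element count, interleave factor, is-masked).
  std::map<int, std::tuple<std::vector<int>, unsigned, unsigned, bool>> Cache;

  const int EntryId = 0; // hypothetical stand-in for a TreeEntry key
  Cache.try_emplace(EntryId, std::vector<int>{0, 2, 4, 6, 8, 10, 12, 14},
                    /*NumElts=*/15, /*InterleaveFactor=*/1, /*IsMasked=*/false);

  // The binding order must match the emplace order above.
  const auto &[Mask, NumElts, InterleaveFactor, IsMasked] = Cache.at(EntryId);
  assert(Mask.size() == 8 && NumElts == 15 && InterleaveFactor == 1 && !IsMasked);
  return 0;
}

The new regression test below was added with this change and goes through the compressed-load path shown above.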
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+declare noalias ptr @malloc()
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call dereferenceable_or_null(16) ptr @malloc()
+; CHECK-NEXT:    [[TMP2:%.*]] = load volatile ptr, ptr null, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <15 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <15 x i8> [[TMP3]], <15 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    ret void
+;
+  %1 = call dereferenceable_or_null(16) ptr @malloc()
+  %2 = load volatile ptr, ptr null, align 8
+  %3 = load i8, ptr %1, align 1
+  store i8 %3, ptr %2, align 1
+  %4 = getelementptr i8, ptr %1, i64 2
+  %5 = load i8, ptr %4, align 1
+  %6 = getelementptr i8, ptr %2, i64 1
+  store i8 %5, ptr %6, align 1
+  %7 = getelementptr i8, ptr %1, i64 4
+  %8 = load i8, ptr %7, align 1
+  %9 = getelementptr i8, ptr %2, i64 2
+  store i8 %8, ptr %9, align 1
+  %10 = getelementptr i8, ptr %1, i64 6
+  %11 = load i8, ptr %10, align 1
+  %12 = getelementptr i8, ptr %2, i64 3
+  store i8 %11, ptr %12, align 1
+  %13 = getelementptr i8, ptr %1, i64 8
+  %14 = load i8, ptr %13, align 1
+  %15 = getelementptr i8, ptr %2, i64 4
+  store i8 %14, ptr %15, align 1
+  %16 = getelementptr i8, ptr %1, i64 10
+  %17 = load i8, ptr %16, align 1
+  %18 = getelementptr i8, ptr %2, i64 5
+  store i8 %17, ptr %18, align 1
+  %19 = getelementptr i8, ptr %1, i64 12
+  %20 = load i8, ptr %19, align 1
+  %21 = getelementptr i8, ptr %2, i64 6
+  store i8 %20, ptr %21, align 1
+  %22 = getelementptr i8, ptr %1, i64 14
+  %23 = load i8, ptr %22, align 1
+  %24 = getelementptr i8, ptr %2, i64 7
+  store i8 %23, ptr %24, align 1
+  ret void
+}
