Skip to content

Commit cd2cf4d

Browse files
committed
[VectorCombine] Switch to using a worklist.
This patch updates VectorCombine to use a worklist to allow iterative simplifications where a combine enables other combines.

Suggested in D100302.

The main use case at the moment is foldSingleElementStore and scalarizeLoadExtract working together to improve scalarization.

Note that we now also do not run SimplifyInstructionsInBlock on the whole function if there have been changes. This means we fail to remove/simplify instructions not related to any of the vector combines. IMO this is fine, as simplifying the whole function seems more like a workaround for not tracking the changed instructions.

Compile-time impact looks neutral:
NewPM-O3: +0.02%
NewPM-ReleaseThinLTO: -0.00%
NewPM-ReleaseLTO-g: -0.02%

http://llvm-compile-time-tracker.com/compare.php?from=52832cd917af00e2b9c6a9d1476ba79754dcabff&to=e66520a4637290550a945d528e3e59573485dd40&stat=instructions

Reviewed By: spatel, lebedev.ri

Differential Revision: https://reviews.llvm.org/D110171

(cherry-picked from 300870a)
1 parent 70a5ba2 commit cd2cf4d

File tree

6 files changed

+82
-46
lines changed

6 files changed

+82
-46
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@
3131
#include "llvm/Transforms/Utils/Local.h"
3232
#include "llvm/Transforms/Vectorize.h"
3333

34+
#define DEBUG_TYPE "vector-combine"
35+
#include "llvm/Transforms/Utils/InstructionWorklist.h"
36+
3437
using namespace llvm;
3538
using namespace llvm::PatternMatch;
3639

37-
#define DEBUG_TYPE "vector-combine"
3840
STATISTIC(NumVecLoad, "Number of vector loads formed");
3941
STATISTIC(NumVecCmp, "Number of vector compares formed");
4042
STATISTIC(NumVecBO, "Number of vector binops formed");
@@ -73,6 +75,7 @@ class VectorCombine {
7375
const DominatorTree &DT;
7476
AAResults &AA;
7577
AssumptionCache &AC;
78+
InstructionWorklist Worklist;
7679

7780
bool vectorizeLoadInsert(Instruction &I);
7881
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
@@ -92,14 +95,26 @@ class VectorCombine {
9295
bool foldExtractedCmps(Instruction &I);
9396
bool foldSingleElementStore(Instruction &I);
9497
bool scalarizeLoadExtract(Instruction &I);
98+
99+
void replaceValue(Value &Old, Value &New) {
100+
Old.replaceAllUsesWith(&New);
101+
New.takeName(&Old);
102+
if (auto *NewI = dyn_cast<Instruction>(&New)) {
103+
Worklist.pushUsersToWorkList(*NewI);
104+
Worklist.pushValue(NewI);
105+
}
106+
Worklist.pushValue(&Old);
107+
}
108+
109+
void eraseInstruction(Instruction &I) {
110+
for (Value *Op : I.operands())
111+
Worklist.pushValue(Op);
112+
Worklist.remove(&I);
113+
I.eraseFromParent();
114+
}
95115
};
96116
} // namespace
97117

98-
static void replaceValue(Value &Old, Value &New) {
99-
Old.replaceAllUsesWith(&New);
100-
New.takeName(&Old);
101-
}
102-
103118
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
104119
// Match insert into fixed vector of scalar value.
105120
// TODO: Handle non-zero insert index.
@@ -501,6 +516,8 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
501516
else
502517
foldExtExtBinop(Ext0, Ext1, I);
503518

519+
Worklist.push(Ext0);
520+
Worklist.push(Ext1);
504521
return true;
505522
}
506523

@@ -928,8 +945,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
928945
DL);
929946
NSI->setAlignment(ScalarOpAlignment);
930947
replaceValue(I, *NSI);
931-
// Need erasing the store manually.
932-
I.eraseFromParent();
948+
eraseInstruction(I);
933949
return true;
934950
}
935951

@@ -939,11 +955,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
939955
/// Try to scalarize vector loads feeding extractelement instructions.
940956
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
941957
Value *Ptr;
942-
Value *Idx;
943-
if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
958+
if (!match(&I, m_Load(m_Value(Ptr))))
944959
return false;
945960

946-
auto *LI = cast<LoadInst>(I.getOperand(0));
961+
auto *LI = cast<LoadInst>(&I);
947962
const DataLayout &DL = I.getModule()->getDataLayout();
948963
if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
949964
return false;
@@ -1039,6 +1054,16 @@ bool VectorCombine::run() {
10391054
return false;
10401055

10411056
bool MadeChange = false;
1057+
auto FoldInst = [this, &MadeChange](Instruction &I) {
1058+
Builder.SetInsertPoint(&I);
1059+
MadeChange |= vectorizeLoadInsert(I);
1060+
MadeChange |= foldExtractExtract(I);
1061+
MadeChange |= foldBitcastShuf(I);
1062+
MadeChange |= scalarizeBinopOrCmp(I);
1063+
MadeChange |= foldExtractedCmps(I);
1064+
MadeChange |= scalarizeLoadExtract(I);
1065+
MadeChange |= foldSingleElementStore(I);
1066+
};
10421067
for (BasicBlock &BB : F) {
10431068
// Ignore unreachable basic blocks.
10441069
if (!DT.isReachableFromEntry(&BB))
@@ -1047,21 +1072,22 @@ bool VectorCombine::run() {
10471072
for (Instruction &I : make_early_inc_range(BB)) {
10481073
if (isa<DbgInfoIntrinsic>(I))
10491074
continue;
1050-
Builder.SetInsertPoint(&I);
1051-
MadeChange |= vectorizeLoadInsert(I);
1052-
MadeChange |= foldExtractExtract(I);
1053-
MadeChange |= foldBitcastShuf(I);
1054-
MadeChange |= scalarizeBinopOrCmp(I);
1055-
MadeChange |= foldExtractedCmps(I);
1056-
MadeChange |= scalarizeLoadExtract(I);
1057-
MadeChange |= foldSingleElementStore(I);
1075+
FoldInst(I);
10581076
}
10591077
}
10601078

1061-
// We're done with transforms, so remove dead instructions.
1062-
if (MadeChange)
1063-
for (BasicBlock &BB : F)
1064-
SimplifyInstructionsInBlock(&BB);
1079+
while (!Worklist.isEmpty()) {
1080+
Instruction *I = Worklist.removeOne();
1081+
if (!I)
1082+
continue;
1083+
1084+
if (isInstructionTriviallyDead(I)) {
1085+
eraseInstruction(*I);
1086+
continue;
1087+
}
1088+
1089+
FoldInst(*I);
1090+
}
10651091

10661092
return MadeChange;
10671093
}

llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,14 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]
2020
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
2121
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]])
2222
; CHECK-NEXT: [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
23-
; CHECK-NEXT: [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8
24-
; CHECK-NEXT: [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]]
23+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
24+
; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
2525
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
26-
; CHECK-NEXT: [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]]
27-
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
2826
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
29-
; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8
27+
; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
28+
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
29+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
30+
; CHECK-NEXT: store double [[SUB]], double* [[TMP10]], align 8
3031
; CHECK-NEXT: ret void
3132
;
3233
entry:

llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@ target triple = "arm64-apple-darwin"
66
define void @load_extract_insert_store_const_idx(<225 x double>* %A) {
77
; CHECK-LABEL: @load_extract_insert_store_const_idx(
88
; CHECK-NEXT: entry:
9-
; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
10-
; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0
9+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 0
10+
; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
1111
; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
12-
; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1
12+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1
13+
; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
1314
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
14-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
15-
; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8
15+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
16+
; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8
1617
; CHECK-NEXT: ret void
1718
;
1819
entry:
@@ -33,13 +34,14 @@ define void @load_extract_insert_store_var_idx_assume_valid(i64 %idx.1, i64 %idx
3334
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_1]])
3435
; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225
3536
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]])
36-
; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
37-
; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
37+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
38+
; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
3839
; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
39-
; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
40+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
41+
; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
4042
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
41-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
42-
; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8
43+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
44+
; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8
4345
; CHECK-NEXT: ret void
4446
;
4547
entry:
@@ -69,13 +71,14 @@ define void @load_extract_insert_store_var_idx_assume_valid_in_dominating_block(
6971
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]])
7072
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[LOOP:%.*]], label [[EXIT:%.*]]
7173
; CHECK: loop:
72-
; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
73-
; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
74+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
75+
; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
7476
; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
75-
; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
77+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
78+
; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
7679
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
77-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
78-
; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8
80+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
81+
; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8
7982
; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond()
8083
; CHECK-NEXT: br i1 [[C_2]], label [[LOOP]], label [[EXIT]]
8184
; CHECK: exit:

llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
453453

454454
define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
455455
; CHECK-LABEL: @PR34724(
456-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
456+
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
457+
; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
458+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
457459
; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
458460
; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
459461
; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>

llvm/test/Transforms/VectorCombine/X86/extract-binop.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
453453

454454
define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
455455
; CHECK-LABEL: @PR34724(
456-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
456+
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
457+
; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
458+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
457459
; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
458460
; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
459461
; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>

llvm/test/Transforms/VectorCombine/load-insert-store.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,9 @@ define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s) {
465465
; CHECK-LABEL: @insert_store_ptr_strip(
466466
; CHECK-NEXT: entry:
467467
; CHECK-NEXT: [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
468-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3
468+
; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[ADDR0]], i64 0
469+
; CHECK-NEXT: [[ADDR2:%.*]] = bitcast <2 x i64>* [[ADDR1]] to <16 x i8>*
470+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[ADDR2]], i32 0, i32 3
469471
; CHECK-NEXT: store i8 [[S:%.*]], i8* [[TMP0]], align 1
470472
; CHECK-NEXT: ret void
471473
;

0 commit comments

Comments
 (0)