
Commit 86aaebf (1 parent: d4cccb6)

[Passes] Run vector-combine early with -fenable-matrix.

IR with matrix intrinsics is likely to also contain large vector operations,
which can benefit from early simplifications. This is the last step in a
series of changes to improve code-gen for code using matrix subscript
operators with the C/C++ matrix extension in Clang, like

    using matrix_t = double __attribute__((matrix_type(15, 15)));

    void foo(unsigned i, matrix_t &A, matrix_t &B) {
      for (unsigned j = 0; j < 4; ++j)
        for (unsigned k = 0; k < i; k++)
          B[k][j] -= A[k][j] * B[i][j];
    }

https://clang.godbolt.org/z/6dKxK1Ed7

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D102496

(cherry-picked from a7c6471)

File tree

5 files changed: +95 −28 lines changed

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 5 additions & 0 deletions

@@ -806,6 +806,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   // Delete small array after loop unroll.
   FPM.addPass(SROA());

+  // The matrix extension can introduce large vector operations early, which can
+  // benefit from running vector-combine early on.
+  if (EnableMatrix)
+    FPM.addPass(VectorCombinePass());
+
   // Eliminate redundancies.
   FPM.addPass(MergedLoadStoreMotionPass());
   if (RunNewGVN)
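
For readers who want to experiment with this ordering outside the full O-level pipeline, here is a minimal standalone sketch (not part of the commit) mirroring the sequence this hunk establishes: SROA, then VectorCombine when the matrix extension is enabled, then redundancy elimination. The helper name buildMatrixAwareFPM and its EnableMatrix parameter are illustrative; only the pass names and headers come from LLVM itself.

    // Minimal sketch, assuming LLVM's new pass manager headers; the helper
    // name and its parameter are hypothetical, the pass ordering mirrors
    // the hunk above.
    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
    #include "llvm/Transforms/Scalar/SROA.h"
    #include "llvm/Transforms/Vectorize/VectorCombine.h"

    llvm::FunctionPassManager buildMatrixAwareFPM(bool EnableMatrix) {
      llvm::FunctionPassManager FPM;
      FPM.addPass(llvm::SROA());                // delete small arrays after unroll
      if (EnableMatrix)                         // matrix IR => large vectors early
        FPM.addPass(llvm::VectorCombinePass()); // simplify before redundancy elim.
      FPM.addPass(llvm::MergedLoadStoreMotionPass());
      return FPM;
    }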

llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

Lines changed: 5 additions & 0 deletions

@@ -434,6 +434,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createReassociatePass());       // Reassociate expressions

+  // The matrix extension can introduce large vector operations early, which can
+  // benefit from running vector-combine early on.
+  if (EnableMatrix)
+    MPM.add(createVectorCombinePass());
+
   // Begin the loop pass pipeline.
   if (EnableSimpleLoopUnswitch) {
     // The simple loop unswitch pass relies on separate cleanup passes. Schedule
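
The same ordering expressed against the legacy pass manager, again as a hedged sketch: the wrapper function is hypothetical, while createReassociatePass and createVectorCombinePass are the factory functions used in the hunk above.

    // Sketch of the legacy-PM equivalent; assumes the factory declarations
    // in llvm/Transforms/Scalar.h and llvm/Transforms/Vectorize.h. The
    // wrapper function itself is hypothetical.
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h"
    #include "llvm/Transforms/Vectorize.h"

    void addMatrixAwarePasses(llvm::legacy::PassManagerBase &MPM,
                              bool EnableMatrix) {
      MPM.add(llvm::createReassociatePass());     // reassociate expressions
      if (EnableMatrix)                           // run vector-combine early
        MPM.add(llvm::createVectorCombinePass());
    }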

llvm/test/Other/new-pm-defaults.ll

Lines changed: 1 addition & 0 deletions

@@ -170,6 +170,7 @@
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
 ; CHECK-O-NEXT: Running pass: SROA on foo
+; CHECK-MATRIX: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVN
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/opt-O3-pipeline-enable-matrix.ll

Lines changed: 1 addition & 0 deletions

@@ -108,6 +108,7 @@
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Optimize scalar/vector ops
 ; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Canonicalize natural loops

("Optimize scalar/vector ops" is the legacy pass manager's printed name for the VectorCombine pass added above.)

llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll

Lines changed: 83 additions & 28 deletions

@@ -26,8 +26,7 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP10]], align 8
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP9]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -93,43 +92,99 @@ define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8
 ; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[I:%.*]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
 ; CHECK-NEXT:    [[CMP212_NOT:%.*]] = icmp eq i32 [[I]], 0
-; CHECK-NEXT:    br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
-; CHECK:       for.cond1.preheader.us.preheader:
-; CHECK-NEXT:    [[DOTPRE_PRE:%.*]] = load <225 x double>, <225 x double>* [[TMP1]], align 8
-; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK-NEXT:    br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
-; CHECK-NEXT:    [[DOTPRE:%.*]] = phi <225 x double> [ [[MATINS_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ [[DOTPRE_PRE]], [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT:    [[J_014_US:%.*]] = phi i32 [ [[INC13_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT:    [[CONV5_US:%.*]] = zext i32 [[J_014_US]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw i64 [[CONV5_US]], 15
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], [[CONV6]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 225
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[I]], 225
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <225 x double> [ [[DOTPRE]], [[FOR_COND1_PREHEADER_US]] ], [ [[MATINS_US]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT:    [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT:    [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], [[CONV_US]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 225
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP7]])
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP6]]
-; CHECK-NEXT:    [[MATRIXEXT_US:%.*]] = load double, double* [[TMP8]], align 8
-; CHECK-NEXT:    [[MATRIXEXT8_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT:    [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8
+; CHECK-NEXT:    [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8
 ; CHECK-NEXT:    [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
-; CHECK-NEXT:    [[MATRIXEXT11_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT:    [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8
 ; CHECK-NEXT:    [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
-; CHECK-NEXT:    [[MATINS_US]] = insertelement <225 x double> [[TMP5]], double [[SUB_US]], i64 [[TMP6]]
-; CHECK-NEXT:    store <225 x double> [[MATINS_US]], <225 x double>* [[TMP1]], align 8
-; CHECK-NEXT:    [[INC_US]] = add nuw i32 [[K_013_US]], 1
+; CHECK-NEXT:    store double [[SUB_US]], double* [[TMP6]], align 8
+; CHECK-NEXT:    [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1
 ; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]]
-; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT:    [[INC13_US]] = add nuw nsw i32 [[J_014_US]], 1
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i32 [[J_014_US]], 3
-; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[CONV6]], 15
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[I]], 210
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP8]])
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    br label [[FOR_BODY4_US_1:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
+; CHECK:       for.body4.us.1:
+; CHECK-NEXT:    [[K_013_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ]
+; CHECK-NEXT:    [[NARROW:%.*]] = add nuw nsw i32 [[K_013_US_1]], 15
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[NARROW]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[K_013_US_1]], 210
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP12]], align 8
+; CHECK-NEXT:    [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT:    [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP13]], align 8
+; CHECK-NEXT:    [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
+; CHECK-NEXT:    store double [[SUB_US_1]], double* [[TMP13]], align 8
+; CHECK-NEXT:    [[INC_US_1]] = add nuw nsw i32 [[K_013_US_1]], 1
+; CHECK-NEXT:    [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]]
+; CHECK-NEXT:    br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]]
+; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.1:
+; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[CONV6]], 30
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[I]], 195
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    br label [[FOR_BODY4_US_2:%.*]]
+; CHECK:       for.body4.us.2:
+; CHECK-NEXT:    [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ]
+; CHECK-NEXT:    [[NARROW16:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[NARROW16]] to i64
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP18]])
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP19]], align 8
+; CHECK-NEXT:    [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP16]], align 8
+; CHECK-NEXT:    [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP20]], align 8
+; CHECK-NEXT:    [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
+; CHECK-NEXT:    store double [[SUB_US_2]], double* [[TMP20]], align 8
+; CHECK-NEXT:    [[INC_US_2]] = add nuw nsw i32 [[K_013_US_2]], 1
+; CHECK-NEXT:    [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]]
+; CHECK-NEXT:    br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]]
+; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.2:
+; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[CONV6]], 45
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ult i32 [[I]], 180
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    br label [[FOR_BODY4_US_3:%.*]]
+; CHECK:       for.body4.us.3:
+; CHECK-NEXT:    [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ]
+; CHECK-NEXT:    [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[NARROW17]] to i64
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP25]])
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT:    [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP26]], align 8
+; CHECK-NEXT:    [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP23]], align 8
+; CHECK-NEXT:    [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT:    [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP27]], align 8
+; CHECK-NEXT:    [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
+; CHECK-NEXT:    store double [[SUB_US_3]], double* [[TMP27]], align 8
+; CHECK-NEXT:    [[INC_US_3]] = add nuw nsw i32 [[K_013_US_3]], 1
+; CHECK-NEXT:    [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]]
+; CHECK-NEXT:    br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]]
 ;
 entry:
   %i.addr = alloca i32, align 4
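
The net effect visible in this test: the whole-vector load, the per-iteration insertelement, and the 225-element store in the inner loop are replaced by scalar loads and stores, and the address of the loop-invariant B[i][j] element is computed once per outer iteration ([[TMP3]], [[TMP9]], ...). As a rough source-level analogy (my own illustration, not part of the commit; it assumes the column-major flattening index j * 15 + k matches the <225 x double> layout), the optimized inner loop behaves like:

    // Rough source-level analogy of the optimized IR above; the function and
    // parameter names are illustrative, and column-major indexing
    // (j * 15 + k) is assumed to match the <225 x double> flattening.
    void inner_loop(const double *A, double *B, unsigned i, unsigned j,
                    unsigned n) {
      const double *pivot = &B[j * 15 + i];        // address computed once per j
      for (unsigned k = 0; k < n; ++k)             // one scalar load/store pair
        B[j * 15 + k] -= A[j * 15 + k] * *pivot;   // per iteration, no 225-wide copies
    }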
