10
10
#define POLLY_SCHEDULEOPTIMIZER_H
11
11
12
12
#include " polly/ScopPass.h"
13
- #include " llvm/ADT/ArrayRef.h"
14
- #include " isl/isl-noexceptions.h"
15
13
16
14
namespace llvm {
17
15
class Pass ;
18
16
class PassRegistry ;
19
- class TargetTransformInfo ;
20
17
} // namespace llvm
21
18
22
- struct isl_schedule_node ;
23
-
24
- // / Parameters of the micro kernel.
25
- // /
26
- // / Parameters, which determine sizes of rank-1 (i.e., outer product) update
27
- // / used in the optimized matrix multiplication.
28
- struct MicroKernelParamsTy {
29
- int Mr;
30
- int Nr;
31
- };
32
-
33
- // / Parameters of the macro kernel.
34
- // /
35
- // / Parameters, which determine sizes of blocks of partitioned matrices
36
- // / used in the optimized matrix multiplication.
37
- struct MacroKernelParamsTy {
38
- int Mc;
39
- int Nc;
40
- int Kc;
41
- };
42
-
43
19
namespace polly {
44
- struct Dependences ;
45
- class MemoryAccess ;
46
- class Scop ;
47
-
48
- // / Additional parameters of the schedule optimizer.
49
- // /
50
- // / Target Transform Info and the SCoP dependencies used by the schedule
51
- // / optimizer.
52
- struct OptimizerAdditionalInfoTy {
53
- const llvm::TargetTransformInfo *TTI;
54
- const Dependences *D;
55
- };
56
-
57
- // / Parameters of the matrix multiplication operands.
58
- // /
59
- // / Parameters, which describe access relations that represent operands of the
60
- // / matrix multiplication.
61
- struct MatMulInfoTy {
62
- MemoryAccess *A = nullptr ;
63
- MemoryAccess *B = nullptr ;
64
- MemoryAccess *ReadFromC = nullptr ;
65
- MemoryAccess *WriteToC = nullptr ;
66
- int i = -1 ;
67
- int j = -1 ;
68
- int k = -1 ;
69
- };
70
-
71
- extern bool DisablePollyTiling;
72
-
73
20
llvm::Pass *createIslScheduleOptimizerWrapperPass ();
74
21
75
22
struct IslScheduleOptimizerPass
@@ -91,273 +38,6 @@ struct IslScheduleOptimizerPrinterPass
91
38
llvm::raw_ostream &OS;
92
39
};
93
40
94
- } // namespace polly
95
-
96
- namespace llvm {
97
- void initializeIslScheduleOptimizerWrapperPassPass (llvm::PassRegistry &);
98
- }
99
-
100
- class ScheduleTreeOptimizer {
101
- public:
102
- // / Apply schedule tree transformations.
103
- // /
104
- // / This function takes a (possibly already optimized) schedule tree and
105
- // / applies a set of additional optimizations on the schedule tree. The
106
- // / transformations applied include:
107
- // /
108
- // / - Tiling
109
- // / - Prevectorization
110
- // /
111
- // / @param Schedule The schedule object the transformations will be applied
112
- // / to.
113
- // / @param OAI Target Transform Info and the SCoP dependencies.
114
- // / @returns The transformed schedule.
115
- static isl::schedule
116
- optimizeSchedule (isl::schedule Schedule,
117
- const polly::OptimizerAdditionalInfoTy *OAI = nullptr );
118
-
119
- // / Apply schedule tree transformations.
120
- // /
121
- // / This function takes a node in a (possibly already optimized) schedule
122
- // / tree and applies a set of additional optimizations on this schedule tree
123
- // / node and its descendants. The transformations applied include:
124
- // /
125
- // / - Tiling
126
- // / - Prevectorization
127
- // /
128
- // / @param Node The schedule object post-transformations will be applied to.
129
- // / @param OAI Target Transform Info and the SCoP dependencies.
130
- // / @returns The transformed schedule node.
131
- static isl::schedule_node
132
- optimizeScheduleNode (isl::schedule_node Node,
133
- const polly::OptimizerAdditionalInfoTy *OAI = nullptr );
134
-
135
- // / Decide if the @p NewSchedule is profitable for @p S.
136
- // /
137
- // / @param S The SCoP we optimize.
138
- // / @param NewSchedule The new schedule we computed.
139
- // /
140
- // / @return True, if we believe @p NewSchedule is an improvement for @p S.
141
- static bool isProfitableSchedule (polly::Scop &S, isl::schedule NewSchedule);
142
-
143
- // / Isolate a set of partial tile prefixes.
144
- // /
145
- // / This set should ensure that it contains only partial tile prefixes that
146
- // / have exactly VectorWidth iterations.
147
- // /
148
- // / @param Node A schedule node band, which is a parent of a band node,
149
- // / that contains a vector loop.
150
- // / @return Modified isl_schedule_node.
151
- static isl::schedule_node isolateFullPartialTiles (isl::schedule_node Node,
152
- int VectorWidth);
153
-
154
- private:
155
- // / Tile a schedule node.
156
- // /
157
- // / @param Node The node to tile.
158
- // / @param Identifier A name that identifies this kind of tiling and
159
- // / that is used to mark the tiled loops in the
160
- // / generated AST.
161
- // / @param TileSizes A vector of tile sizes that should be used for
162
- // / tiling.
163
- // / @param DefaultTileSize A default tile size that is used for dimensions
164
- // / that are not covered by the TileSizes vector.
165
- static isl::schedule_node tileNode (isl::schedule_node Node,
166
- const char *Identifier,
167
- llvm::ArrayRef<int > TileSizes,
168
- int DefaultTileSize);
169
-
170
- // / Tile a schedule node and unroll point loops.
171
- // /
172
- // / @param Node The node to register tile.
173
- // / @param TileSizes A vector of tile sizes that should be used for
174
- // / tiling.
175
- // / @param DefaultTileSize A default tile size that is used for dimensions that are not covered by the TileSizes vector.
176
- static isl::schedule_node applyRegisterTiling (isl::schedule_node Node,
177
- llvm::ArrayRef<int > TileSizes,
178
- int DefaultTileSize);
179
-
180
- // / Apply the BLIS matmul optimization pattern.
181
- // /
182
- // / Make the loops containing the matrix multiplication be the innermost
183
- // / loops and apply the BLIS matmul optimization pattern. BLIS implements
184
- // / gemm as three nested loops around a macro-kernel, plus two packing
185
- // / routines. The macro-kernel is implemented in terms of two additional
186
- // / loops around a micro-kernel. The micro-kernel is a loop around a rank-1
187
- // / (i.e., outer product) update.
188
- // /
189
- // / For a detailed description please see [1].
190
- // /
191
- // / The order of the loops defines the data reused in the BLIS implementation
192
- // / of gemm ([1]). In particular, elements of the matrix B, the second
193
- // / operand of matrix multiplication, are reused between iterations of the
194
- // / innermost loop. To keep the reused data in cache, only elements of matrix
195
- // / A, the first operand of matrix multiplication, should be evicted during
196
- // / an iteration of the innermost loop. To provide such a cache replacement
197
- // / policy, elements of the matrix A can, in particular, be loaded first and,
198
- // / consequently, be least-recently-used.
199
- // /
200
- // / In our case matrices are stored in row-major order instead of
201
- // / column-major order used in the BLIS implementation ([1]). It affects only
202
- // / the form of the BLIS micro kernel and the computation of its
203
- // / parameters. In particular, reused elements of the matrix B are
204
- // / successively multiplied by specific elements of the matrix A.
205
- // /
206
- // / Refs.:
207
- // / [1] - Analytical Modeling is Enough for High Performance BLIS
208
- // / Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
209
- // / Technical Report, 2014
210
- // / http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
211
- // /
212
- // / @see ScheduleTreeOptimizer::createMicroKernel
213
- // / @see ScheduleTreeOptimizer::createMacroKernel
214
- // / @see getMicroKernelParams
215
- // / @see getMacroKernelParams
216
- // /
217
- // / TODO: Implement the packing transformation.
218
- // /
219
- // / @param Node The node that contains a band to be optimized. The node
220
- // / is required to successfully pass
221
- // / ScheduleTreeOptimizer::isMatrMultPattern.
222
- // / @param TTI Target Transform Info.
223
- // / @param MMI Parameters of the matrix multiplication operands.
224
- // / @returns The transformed schedule.
225
- static isl::schedule_node
226
- optimizeMatMulPattern (isl::schedule_node Node,
227
- const llvm::TargetTransformInfo *TTI,
228
- polly::MatMulInfoTy &MMI);
229
-
230
- // / Check if this node is a band node we want to tile.
231
- // /
232
- // / We look for innermost band nodes where individual dimensions are marked as
233
- // / permutable.
234
- // /
235
- // / @param Node The node to check.
236
- static bool isTileableBandNode (isl::schedule_node Node);
237
-
238
- // / Pre-vectorizes one scheduling dimension of a schedule band.
239
- // /
240
- // / prevectSchedBand splits out the dimension DimToVectorize, tiles it and
241
- // / sinks the resulting point loop.
242
- // /
243
- // / Example (DimToVectorize=0, VectorWidth=4):
244
- // /
245
- // / | Before transformation:
246
- // / |
247
- // / | A[i,j] -> [i,j]
248
- // / |
249
- // / | for (i = 0; i < 128; i++)
250
- // / | for (j = 0; j < 128; j++)
251
- // / | A(i,j);
252
- // /
253
- // / | After transformation:
254
- // / |
255
- // / | for (it = 0; it < 32; it+=1)
256
- // / | for (j = 0; j < 128; j++)
257
- // / | for (ip = 0; ip <= 3; ip++)
258
- // / | A(4 * it + ip,j);
259
- // /
260
- // / The goal of this transformation is to create a trivially vectorizable
261
- // / loop. This means a parallel loop at the innermost level that has a
262
- // / constant number of iterations corresponding to the target vector width.
263
- // /
264
- // / This transformation creates a loop at the innermost level. The loop has
265
- // / a constant number of iterations, if the number of loop iterations at
266
- // / DimToVectorize can be divided by VectorWidth. The default VectorWidth is
267
- // / currently constant and not yet target specific. This function does not
268
- // / reason about parallelism.
269
- static isl::schedule_node prevectSchedBand (isl::schedule_node Node,
270
- unsigned DimToVectorize,
271
- int VectorWidth);
272
-
273
- // / Apply additional optimizations on the bands in the schedule tree.
274
- // /
275
- // / We are looking for an innermost band node and apply the following
276
- // / transformations:
277
- // /
278
- // / - Tile the band
279
- // / - if the band is tileable
280
- // / - if the band has more than one loop dimension
281
- // /
282
- // / - Prevectorize the schedule of the band (or the point loop in case of
283
- // / tiling).
284
- // / - if vectorization is enabled
285
- // /
286
- // / @param Node The schedule node to (possibly) optimize.
287
- // / @param User A pointer to forward some user information
288
- // / (currently unused).
289
- static isl_schedule_node *optimizeBand (isl_schedule_node *Node, void *User);
290
-
291
- // / Apply additional optimizations on the bands in the schedule tree.
292
- // /
293
- // / We apply the following
294
- // / transformations:
295
- // /
296
- // / - Tile the band
297
- // / - Prevectorize the schedule of the band (or the point loop in case of
298
- // / tiling).
299
- // / - if vectorization is enabled
300
- // /
301
- // / @param Node The schedule node to (possibly) optimize.
302
- // / @param User A pointer to forward some user information
303
- // / (currently unused).
304
- static isl::schedule_node standardBandOpts (isl::schedule_node Node,
305
- void *User);
306
-
307
- // / Check if this node contains a partial schedule that could
308
- // / probably be optimized with analytical modeling.
309
- // /
310
- // / isMatrMultPattern tries to determine whether the following conditions
311
- // / are true:
312
- // / 1. the partial schedule contains only one statement.
313
- // / 2. there are exactly three input dimensions.
314
- // / 3. all memory accesses of the statement will have stride 0 or 1, if we
315
- // / interchange loops (switch the variable used in the inner loop to
316
- // / the outer loop).
317
- // / 4. all memory accesses of the statement, except for the last one, are
318
- // / read memory accesses and the last one is a write memory access.
319
- // / 5. all subscripts of the last memory access of the statement don't
320
- // / contain the variable used in the inner loop.
321
- // / If this is the case, we could try to use an approach that is similar to
322
- // / the one used to get close-to-peak performance of matrix multiplications.
323
- // /
324
- // / @param Node The node to check.
325
- // / @param D The SCoP dependencies.
326
- // / @param MMI Parameters of the matrix multiplication operands.
327
- static bool isMatrMultPattern (isl::schedule_node Node,
328
- const polly::Dependences *D,
329
- polly::MatMulInfoTy &MMI);
330
-
331
- // / Create the BLIS macro-kernel.
332
- // /
333
- // / We create the BLIS macro-kernel by applying a combination of tiling
334
- // / of dimensions of the band node and interchanging of two innermost
335
- // / modified dimensions. The values of MacroKernelParams's fields are used
336
- // / as tile sizes.
337
- // /
338
- // / @param Node The schedule node to be modified.
339
- // / @param MacroKernelParams Parameters of the macro kernel
340
- // / to be used as tile sizes.
341
- static isl::schedule_node
342
- createMacroKernel (isl::schedule_node Node,
343
- MacroKernelParamsTy MacroKernelParams);
344
-
345
- // / Create the BLIS micro-kernel.
346
- // /
347
- // / We create the BLIS micro-kernel by applying a combination of tiling
348
- // / of dimensions of the band node and interchanging of two innermost
349
- // / modified dimensions. The values passed in MicroKernelParams are used
350
- // / as tile sizes.
351
- // /
352
- // / @param Node The schedule node to be modified.
353
- // / @param MicroKernelParams Parameters of the micro kernel
354
- // / to be used as tile sizes.
355
- // / @see MicroKernelParamsTy
356
- static isl::schedule_node
357
- createMicroKernel (isl::schedule_node Node,
358
- MicroKernelParamsTy MicroKernelParams);
359
- };
360
-
361
41
// / Build the desired set of partial tile prefixes.
362
42
// /
363
43
// / We build a set of partial tile prefixes, which are prefixes of the vector
@@ -377,4 +57,10 @@ class ScheduleTreeOptimizer {
377
57
// / @param ScheduleRange A range of a map, which describes a prefix schedule
378
58
// / relation.
379
59
isl::set getPartialTilePrefixes (isl::set ScheduleRange, int VectorWidth);
60
+ } // namespace polly
61
+
62
+ namespace llvm {
63
+ void initializeIslScheduleOptimizerWrapperPassPass (llvm::PassRegistry &);
64
+ }
65
+
380
66
#endif // POLLY_SCHEDULEOPTIMIZER_H
0 commit comments