Skip to content

Commit 0ca7a92

Browse files
Reworked
1 parent c96c1ca commit 0ca7a92

File tree

8 files changed

+400
-147
lines changed

8 files changed

+400
-147
lines changed

include/gc/Transforms/Passes.td

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -124,29 +124,40 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
124124
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125125
let summary = "GPU tiling and fusion path.";
126126
let description = [{
127-
This path tiles linalg operations and wraps into foreach loops.
128-
The tiles calculation is based on the Execution Unit cache size and the number of threads per EU.
127+
This pass tiles linalg operations and creates an inner loop that is mapped to the block sizes, when converting
128+
to gpu.launch. The tile sizes are calculated from the GPU device properties retrieved from the DLTI attributes.
129+
If the DLTI attributes are not specified, the values of the pass options are used instead.
129130
}];
130131
let options = [
131-
Option<"euMem", "eu-mem", "size_t",
132+
Option<"numEus", "num-eus", "size_t",
133+
/*default=*/"448",
134+
"Number of Execution Units.">,
135+
Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
136+
/*default=*/"8",
137+
"Number of Execution Units per slice.">,
138+
Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
139+
/*default=*/"8",
140+
"Number of threads per Execution Unit.">,
141+
Option<"cacheSize", "cache-size", "size_t",
132142
/*default=*/"131072",
133143
"Execution Unit cache size.">,
134-
Option<"euThreads", "eu-threads", "size_t",
135-
/*default=*/"8",
136-
"Number of threads per EU.">
144+
Option<"vectorWidth", "vector-width", "size_t",
145+
/*default=*/"512",
146+
"The maximum width of EU's vector registers.">
137147
];
138148
}
139149

140150
def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
141151
let summary = "Create nested parallel loops to be mapped to GPU.";
142152
let description = [{
143-
This path tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops.
144-
Each tile of the outer loop is divided by the number of threads per EU.
153+
This pass tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops. The tiles
154+
calculation is based on the max_work_group_size DLTI attribute. If the attribute is not specified,
155+
the value of the pass option is used instead.
145156
}];
146157
let options = [
147-
Option<"euThreads", "eu-threads", "size_t",
148-
/*default=*/"8",
149-
"Number of threads per Execution Unit.">
158+
Option<"workGroupSize", "work-group-size", "size_t",
159+
/*default=*/"64",
160+
"The maximum workgroup size.">
150161
];
151162
}
152163
#endif // GC_USE_IMEX

lib/gc/Transforms/GPU/GpuLoopTiling.cpp

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "gc/Utils/Log.h"
2323

2424
using namespace mlir;
25-
// using namespace mlir::gc::gpu;
25+
using namespace mlir::gc;
2626

2727
namespace mlir::gc {
2828
#define GEN_PASS_DECL_GPULOOPTILING
@@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
4141

4242
void runOnOperation() override {
4343
IRRewriter rewriter(&getContext());
44-
auto euThreads = static_cast<double>(getEuThreads(rewriter));
45-
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
44+
auto wgSize = getWorkGroupSize(rewriter);
45+
getOperation().walk<WalkOrder::PreOrder>([wgSize](scf::ParallelOp loop) {
4646
if (!loop->getParentOfType<scf::ParallelOp>()) {
47-
tile(loop, euThreads);
47+
SmallVector<int64_t> tiles;
48+
auto steps = loop.getStep();
49+
tiles.reserve(steps.size());
50+
51+
for (auto step : steps) {
52+
if (auto v = getConstIdxValue(step)) {
53+
tiles.push_back(v);
54+
} else {
55+
tiles.push_back(32);
56+
}
57+
}
58+
59+
adjustTiles(wgSize, tiles);
60+
tileParallelLoop(loop, tiles, false);
4861
}
4962
return WalkResult::skip();
5063
});
5164
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
5265
gcLogD("Failed to simplify regions");
5366
}
5467
}
55-
56-
private:
57-
static void tile(scf::ParallelOp loop, double euThreads) {
58-
SmallVector<int64_t> tileSizes;
59-
auto steps = loop.getStep();
60-
tileSizes.reserve(steps.size());
61-
62-
for (auto step : steps) {
63-
if (auto v = getConstIdxValue(step)) {
64-
tileSizes.push_back(static_cast<int64_t>(
65-
std::ceil(static_cast<double>(v) / euThreads)));
66-
} else {
67-
tileSizes.push_back(32);
68-
}
69-
}
70-
71-
tileParallelLoop(loop, tileSizes, false);
72-
}
7368
};
7469
} // namespace

0 commit comments

Comments
 (0)