Skip to content

Commit 7d157e7

Browse files
Implemented tiling and fusion path for GPU
1 parent 8716a7d commit 7d157e7

File tree

6 files changed

+472
-1
lines changed

6 files changed

+472
-1
lines changed

include/gc/Transforms/Passes.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,35 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
112112
"Call finish() after each kernel launch.">
113113
];
114114
}
115+
116+
// Tiles linalg operations for the GPU. Tile sizes are derived from the
// `eu-mem` and `eu-threads` options declared below.
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
  let summary = "GPU tiling and fusion pass.";
  let description = [{
    This pass tiles linalg operations and wraps them into foreach loops.
    The tile sizes are calculated from the Execution Unit cache size and the
    number of threads per EU.
  }];
  let options = [
    Option<"euMem", "eu-mem", "size_t",
           /*default=*/"131072",
           "Execution Unit cache size.">,
    Option<"euThreads", "eu-threads", "size_t",
           /*default=*/"8",
           "Number of threads per EU.">
  ];
}
131+
132+
// Second tiling stage: splits the parallel loops produced after
// GpuTilingAndFusion using the `eu-threads` option declared below.
def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
  let summary = "Create nested parallel loops to be mapped to GPU.";
  let description = [{
    This pass tiles the loops that were created by the GpuTilingAndFusion pass
    and then converted to parallel loops.
    Each tile of the outer loop is divided by the number of threads per EU.
  }];
  let options = [
    Option<"euThreads", "eu-threads", "size_t",
           /*default=*/"8",
           "Number of threads per Execution Unit.">
  ];
}
115144
#endif // GC_USE_IMEX
116145

117146
def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
1212

1313
gc_add_mlir_library(GcGpuPasses
1414
AddContextArg.cpp
15+
GpuLoopTiling.cpp
16+
GpuTilingAndFusion.cpp
1517
GpuToGpuOcl.cpp
1618
LinalgToXeGPU.cpp
1719
Pipeline.cpp
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//===-- GpuLoopTiling.cpp - Tile GPU parallel loops -------------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
10+
#include "mlir/Conversion/LLVMCommon/Pattern.h"
11+
#include "mlir/Conversion/Passes.h"
12+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/Affine/Utils.h"
14+
#include "mlir/Dialect/Func/IR/FuncOps.h"
15+
#include "mlir/Dialect/GPU/Transforms/Passes.h"
16+
#include "mlir/Dialect/Linalg/IR/Linalg.h"
17+
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
18+
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
19+
#include "mlir/Transforms/RegionUtils.h"
20+
21+
#include "./GpuUtils.h"
22+
#include "gc/Utils/Log.h"
23+
24+
using namespace mlir;
25+
// using namespace mlir::gc::gpu;
26+
27+
namespace mlir::gc {
28+
#define GEN_PASS_DECL_GPULOOPTILING
29+
#define GEN_PASS_DEF_GPULOOPTILING
30+
#include "gc/Transforms/Passes.h.inc"
31+
} // namespace mlir::gc
32+
33+
namespace {
34+
35+
/// Pass that tiles every outermost `scf.parallel` loop of the function.
///
/// For each loop dimension whose step is a known positive constant, the tile
/// size is the step divided by the number of threads per Execution Unit,
/// rounded up. Dimensions with a non-constant step use a fixed fallback tile
/// size.
struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
                             gc::impl::GpuLoopTilingBase<GpuLoopTiling> {
  friend GpuPass;
  explicit GpuLoopTiling() : GpuLoopTiling(gc::GpuLoopTilingOptions{}) {}
  explicit GpuLoopTiling(const gc::GpuLoopTilingOptions &opts)
      : GpuPass(), GpuLoopTilingBase(opts) {}

  void runOnOperation() override {
    IRRewriter rewriter(&getContext());

    // Clamp to at least 1: a zero (or overflowed) thread count would otherwise
    // lead to a division by zero when computing the tile sizes.
    auto euThreads = static_cast<int64_t>(getEuThreads(rewriter));
    if (euThreads < 1) {
      euThreads = 1;
    }

    getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
      if (!loop->getParentOfType<scf::ParallelOp>()) {
        tile(loop, euThreads);
      }
      // Always skip: the pre-order walk must not descend into the body of a
      // loop that has just been handed to the tiling utility, so only the
      // outermost parallel loops are ever visited.
      return WalkResult::skip();
    });

    // NOTE(review): per the MLIR RegionUtils documentation, simplifyRegions()
    // reports failure when nothing was simplified, so this message may not
    // indicate an actual error - confirm before treating it as one.
    if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
      gcLogD("Failed to simplify regions");
    }
  }

private:
  /// Tile size used for a dimension whose step is not a positive constant.
  static constexpr int64_t kFallbackTileSize = 32;

  /// Tiles `loop`, using ceil(step / euThreads) as the tile size of every
  /// dimension with a constant step. `euThreads` must be >= 1.
  static void tile(scf::ParallelOp loop, int64_t euThreads) {
    SmallVector<int64_t> tileSizes;
    auto steps = loop.getStep();
    tileSizes.reserve(steps.size());

    for (auto step : steps) {
      // Presumably getConstIdxValue() yields 0 when the step is not a
      // compile-time constant - verify against GpuUtils.h.
      const auto stepValue = static_cast<int64_t>(getConstIdxValue(step));
      if (stepValue > 0) {
        // Integer ceiling division. The `1 + (a - 1) / b` form cannot
        // overflow for a > 0 and avoids the precision loss of a round trip
        // through double.
        tileSizes.push_back(1 + (stepValue - 1) / euThreads);
      } else {
        tileSizes.push_back(kFallbackTileSize);
      }
    }

    tileParallelLoop(loop, tileSizes, false);
  }
};
74+
} // namespace

0 commit comments

Comments
 (0)