Skip to content

Commit c96c1ca

Browse files
Implemented tiling and fusion path for GPU
1 parent ddc3dd3 commit c96c1ca

File tree

6 files changed

+472
-1
lines changed

6 files changed

+472
-1
lines changed

include/gc/Transforms/Passes.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,35 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
120120
"Call finish() after each kernel launch.">
121121
];
122122
}
123+
124+
// Declares the `gpu-tiling` pass, anchored on func::FuncOp, with two options:
// `eu-mem` (default 131072) and `eu-threads` (default 8).
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125+
let summary = "GPU tiling and fusion pass.";
126+
let description = [{
127+
This pass tiles linalg operations and wraps them into foreach loops.
128+
The tile size calculation is based on the Execution Unit cache size and the number of threads per EU.
129+
}];
130+
let options = [
131+
Option<"euMem", "eu-mem", "size_t",
132+
/*default=*/"131072",
133+
"Execution Unit cache size.">,
134+
Option<"euThreads", "eu-threads", "size_t",
135+
/*default=*/"8",
136+
"Number of threads per EU.">
137+
];
138+
}
139+
140+
// Declares the `gpu-loop-tiling` pass, anchored on func::FuncOp, with a single
// option `eu-threads` (default 8).
def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
141+
let summary = "Create nested parallel loops to be mapped to GPU.";
142+
let description = [{
143+
This pass tiles the loops that were created by the GpuTilingAndFusion pass and then converted to parallel loops.
144+
Each tile of the outer loop is divided by the number of threads per EU.
145+
}];
146+
let options = [
147+
Option<"euThreads", "eu-threads", "size_t",
148+
/*default=*/"8",
149+
"Number of threads per Execution Unit.">
150+
];
151+
}
123152
#endif // GC_USE_IMEX
124153

125154
def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
1313
gc_add_mlir_library(GcGpuPasses
1414
AddContextArg.cpp
1515
AllocsToSLM.cpp
16+
GpuLoopTiling.cpp
17+
GpuTilingAndFusion.cpp
1618
GpuToGpuOcl.cpp
1719
LinalgToXeGPU.cpp
1820
Pipeline.cpp
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//===-- GpuLoopTiling.cpp - Tile parallel loops for GPU ---------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
10+
#include "mlir/Conversion/LLVMCommon/Pattern.h"
11+
#include "mlir/Conversion/Passes.h"
12+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/Affine/Utils.h"
14+
#include "mlir/Dialect/Func/IR/FuncOps.h"
15+
#include "mlir/Dialect/GPU/Transforms/Passes.h"
16+
#include "mlir/Dialect/Linalg/IR/Linalg.h"
17+
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
18+
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
19+
#include "mlir/Transforms/RegionUtils.h"
20+
21+
#include "./GpuUtils.h"
22+
#include "gc/Utils/Log.h"
23+
24+
using namespace mlir;
25+
// using namespace mlir::gc::gpu;
26+
27+
namespace mlir::gc {
28+
#define GEN_PASS_DECL_GPULOOPTILING
29+
#define GEN_PASS_DEF_GPULOOPTILING
30+
#include "gc/Transforms/Passes.h.inc"
31+
} // namespace mlir::gc
32+
33+
namespace {
34+
35+
// Pass implementation that tiles every scf.parallel loop which is not nested
// inside another scf.parallel. For each loop, a tile size is derived per step
// operand (see tile()) and the rewrite itself is delegated to
// tileParallelLoop(). After the walk, region simplification is attempted on
// the operation the pass runs on.
struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
36+
gc::impl::GpuLoopTilingBase<GpuLoopTiling> {
37+
// Grants the GpuPass base access to non-public members of this class.
// GpuPass is not defined in this file (presumably in GpuUtils.h - confirm).
friend GpuPass;
38+
// Default construction delegates to the options constructor with
// default-constructed options.
explicit GpuLoopTiling() : GpuLoopTiling(gc::GpuLoopTilingOptions{}) {}
39+
// Forwards the given options to the generated pass base class.
explicit GpuLoopTiling(const gc::GpuLoopTilingOptions &opts)
40+
: GpuPass(), GpuLoopTilingBase(opts) {}
41+
42+
// Pass entry point: tiles the qualifying parallel loops, then tries to
// simplify the regions of the current operation.
void runOnOperation() override {
43+
IRRewriter rewriter(&getContext());
44+
// Kept as double because tile() divides by it and rounds up with std::ceil.
// getEuThreads() is not defined in this block (presumably provided by
// GpuPass - confirm in GpuUtils.h).
auto euThreads = static_cast<double>(getEuThreads(rewriter));
45+
// Pre-order walk over scf.parallel ops. The callback always returns skip(),
// so the body of a visited loop is not descended into.
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
46+
// Only loops without an enclosing scf.parallel are tiled.
if (!loop->getParentOfType<scf::ParallelOp>()) {
47+
tile(loop, euThreads);
48+
}
49+
return WalkResult::skip();
50+
});
51+
// A simplification failure is only logged; the pass is not marked as failed.
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
52+
gcLogD("Failed to simplify regions");
53+
}
54+
}
55+
56+
private:
57+
// Computes one tile size per step operand of `loop` and calls
// tileParallelLoop() with them.
//   loop      - the scf.parallel loop to tile.
//   euThreads - divisor applied to each step value that getConstIdxValue()
//               resolves to a truthy result.
static void tile(scf::ParallelOp loop, double euThreads) {
58+
SmallVector<int64_t> tileSizes;
59+
auto steps = loop.getStep();
60+
tileSizes.reserve(steps.size());
61+
62+
for (auto step : steps) {
63+
// getConstIdxValue() is a project helper not shown here; presumably it
// yields the constant index value of `step`, or a falsy result when the
// step is not a constant - confirm in GpuUtils.h.
if (auto v = getConstIdxValue(step)) {
64+
// Tile size = ceil(step / euThreads).
tileSizes.push_back(static_cast<int64_t>(
65+
std::ceil(static_cast<double>(v) / euThreads)));
66+
} else {
67+
// Fallback tile size when no truthy value was obtained for the step.
tileSizes.push_back(32);
68+
}
69+
}
70+
71+
// NOTE(review): the trailing `false` is presumably the noMinMaxBounds flag of
// the upstream MLIR tileParallelLoop() - confirm against its declaration.
tileParallelLoop(loop, tileSizes, false);
72+
}
73+
};
74+
} // namespace

0 commit comments

Comments
 (0)