Skip to content

Commit 62f4333

Browse files
Apply suggestions from code review
1 parent 95224e2 commit 62f4333

File tree

5 files changed

+48
-34
lines changed

5 files changed

+48
-34
lines changed

include/gc/Transforms/Passes.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
124124
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125125
let summary = "GPU tiling and fusion path.";
126126
let description = [{
127-
This pass tiles linalg operations and creates two nested csf.forall loops. When converting to gpu.launch,
127+
This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
128128
the inner loop is mapped to the block sizes and the outer - to grid sizes. The tiles calculation is based
129129
on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
130130
defaults to the pass options.
@@ -139,9 +139,9 @@ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
139139
Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
140140
/*default=*/"8",
141141
"Number of threads per Execution Unit.">,
142-
Option<"cacheSize", "cache-size", "size_t",
142+
Option<"localMemSize", "local-mem-size", "size_t",
143143
/*default=*/"131072",
144-
"Execution Unit cache size.">,
144+
"The size of the local memory, shared across a work-group.">,
145145
Option<"vectorWidth", "vector-width", "size_t",
146146
/*default=*/"512",
147147
"The maximum width of EU's vector registers.">,

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
877877
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
878878
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
879879
// Assuming the cache size is equal to the local mem
880-
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
880+
{CL_DEVICE_LOCAL_MEM_SIZE, "local_mem_size"},
881881
};
882882

883883
unsigned i = 0;

lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,17 @@ struct GpuTilingAndFusion final
5151
OpRewriter rw(fn);
5252
tileAndFuseLinalgOps(rw, fn);
5353
tileForallOps(rw, fn);
54-
if (failed(simplifyRegions(rw, fn->getRegions()))) {
55-
// Not simplified
56-
}
5754
}
5855

5956
private:
6057
void tileAndFuseLinalgOps(OpRewriter &rw, func::FuncOp &fn) {
6158
auto numEus = getNumEus(rw);
6259
auto numEusPerSlice = getNumEusPerSlice(rw);
6360
auto numThreadsPerEu = getNumThreadsPerEu(rw);
64-
auto cacheSize = getCacheSize(rw);
61+
auto localMemSize = getLocalMemSize(rw);
6562
auto vectorWidth = getVectorWidth(rw);
6663
auto cachePerThread =
67-
std::max(cacheSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
64+
std::max(localMemSize / numEusPerSlice / numThreadsPerEu, vectorWidth);
6865
SCFTileAndFuseOptions opts;
6966
opts.tilingOptions.setTileSizeComputationFunction(
7067
[&rw, cachePerThread, vectorWidth,

lib/gc/Transforms/GPU/GpuUtils.h

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ template <typename DerivedT> struct GpuPass {
5858
static_cast<DerivedT *>(this)->numThreadsPerEu);
5959
}
6060

61-
int64_t getCacheSize(Builder &builder) {
62-
return getGpuPropertyAsInt(builder, "L1_cache_size_in_bytes",
63-
static_cast<DerivedT *>(this)->cacheSize);
61+
int64_t getLocalMemSize(Builder &builder) {
62+
return getGpuPropertyAsInt(builder, "local_mem_size",
63+
static_cast<DerivedT *>(this)->localMemSize);
6464
}
6565

6666
int64_t getVectorWidth(Builder &builder) {
@@ -87,7 +87,13 @@ struct OpRewriter final : IRRewriter {
8787
arith::ConstantIndexOp getConstant(int64_t v) {
8888
if (func.empty()) {
8989
func.addEntryBlock();
90-
} else {
90+
}
91+
#ifdef NDEBUG
92+
return create<arith::ConstantIndexOp>(v);
93+
#else
94+
// Make the code more clean for debugging - move the constants to the top of
95+
// the function and reuse the existing constants.
96+
else {
9197
// Try to find ConstantIndexOp with the same value in the function.
9298
for (auto &op : func.getBody().getOps()) {
9399
if (auto constOp = dyn_cast<arith::ConstantIndexOp>(op)) {
@@ -103,6 +109,7 @@ struct OpRewriter final : IRRewriter {
103109
auto op = create<arith::ConstantIndexOp>(v);
104110
restoreInsertionPoint(ip);
105111
return op;
112+
#endif
106113
}
107114

108115
private:
@@ -152,9 +159,11 @@ static void adjustTiles(T totalSize, T *begin, T *end,
152159
return;
153160
}
154161

155-
--end;
156-
T a = *begin;
157-
T b = *end;
162+
// a and b are the initial tile sizes, x and y are the new sizes.
163+
T *aPtr = begin;
164+
T *bPtr = end - 1;
165+
T a = *aPtr;
166+
T b = *bPtr;
158167
bool swap = a < b;
159168
if (swap) {
160169
std::swap(a, b);
@@ -164,8 +173,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
164173
b = floorPow2(b);
165174

166175
if (a * b <= total) {
167-
*begin = swap ? b : a;
168-
*end = swap ? a : b;
176+
*aPtr = swap ? b : a;
177+
*bPtr = swap ? a : b;
169178
return;
170179
}
171180

@@ -196,8 +205,8 @@ static void adjustTiles(T totalSize, T *begin, T *end,
196205
}
197206
}
198207

199-
*begin = swap ? y : x;
200-
*end = swap ? x : y;
208+
*aPtr = swap ? y : x;
209+
*bPtr = swap ? x : y;
201210
}
202211

203212
template <typename T, unsigned N>

test/mlir/unittests/Transforms/GPU/GpuUtilsTest.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,38 @@
1313
#include "gtest/gtest.h"
1414

1515
TEST(testAdjustTiles, GputUtilsTest) {
16-
auto testCalc = [](int64_t totalSize, SmallVector<int64_t> &tiles,
17-
const SmallVector<int64_t> &expected) {
18-
std::cout << totalSize << ": [";
19-
for (unsigned i = 0; i < tiles.size(); i++) {
20-
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
16+
bool print = false;
17+
auto testAdjust = [print](int64_t totalSize, SmallVector<int64_t> &tiles,
18+
const SmallVector<int64_t> &expected) {
19+
if (print) {
20+
std::cout << totalSize << ": [";
21+
for (unsigned i = 0; i < tiles.size(); i++) {
22+
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
23+
}
24+
std::cout << "] -> [";
2125
}
22-
std::cout << "] -> [";
26+
2327
gc::adjustTiles(totalSize, tiles);
24-
for (unsigned i = 0; i < tiles.size(); i++) {
25-
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
28+
29+
if (print) {
30+
for (unsigned i = 0; i < tiles.size(); i++) {
31+
std::cout << tiles[i] << (i + 1 < tiles.size() ? ", " : "");
32+
}
33+
std::cout << "]" << std::endl;
2634
}
27-
std::cout << "]" << std::endl;
35+
2836
EXPECT_EQ(tiles, expected);
2937
};
30-
auto test = [testCalc](int64_t totalSize, SmallVector<int64_t> tiles,
31-
SmallVector<int64_t> expected) {
38+
auto test = [testAdjust](int64_t totalSize, SmallVector<int64_t> tiles,
39+
SmallVector<int64_t> expected) {
3240
if (tiles.size() != 2 || tiles[0] == tiles[1]) {
33-
testCalc(totalSize, tiles, expected);
41+
testAdjust(totalSize, tiles, expected);
3442
return;
3543
}
3644
SmallVector<int64_t> reversed(tiles.rbegin(), tiles.rend());
37-
testCalc(totalSize, tiles, expected);
45+
testAdjust(totalSize, tiles, expected);
3846
std::reverse(expected.begin(), expected.end());
39-
testCalc(totalSize, reversed, expected);
47+
testAdjust(totalSize, reversed, expected);
4048
};
4149

4250
test(8, {1, 1}, {1, 1});

0 commit comments

Comments
 (0)