Skip to content

Commit 7a7eacc

Browse files
author
Stephan Herhut
committed
[MLIR][GPU] Implement a simple greedy loop mapper.
Summary: The mapper assigns annotations to loop.parallel operations that are compatible with the loop-to-GPU mapping pass. The outermost loop uses the grid dimensions, followed by block dimensions. All remaining loops are mapped to sequential loops. Differential Revision: https://reviews.llvm.org/D74963
1 parent 157b3d5 commit 7a7eacc

File tree

9 files changed

+252
-8
lines changed

9 files changed

+252
-8
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file declares the utilities to generate mappings for parallel
// loops to GPU devices.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

namespace mlir {

struct Region;

namespace gpu {

/// Name of the mapping attribute produced by loop mappers.
static constexpr const char *kMappingAttributeName = "mapping";
/// Name of the processor sub-attribute that identifies the hardware id
/// to map a loop to.
static constexpr const char *kProcessorEntryName = "processor";
/// Name of the map sub-attribute that identifies the affine map to apply
/// to the hardware id to compute the iteration number of the loop. This
/// map is expected to be extended by step and lower bound computations:
///   index = map(hardware_id) * step + lowerbound
static constexpr const char *kIndexMapEntryName = "map";
/// Name of the bound sub-attribute that identifies the affine map to
/// compute an upper bound of iterations for the hardware id. This is
/// applied to an upper bound on the number of iterations:
///   launchBound = bound(upperbound-lowerbound ceildiv step)
static constexpr const char *kBoundMapEntryName = "bound";

} // end namespace gpu

/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
void greedilyMapParallelLoopsToGPU(Region &region);

} // end namespace mlir

#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

mlir/include/mlir/Dialect/LoopOps/LoopOps.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,9 @@ def ParallelOp : Loop_Op<"parallel",
289289

290290
let extraClassDeclaration = [{
291291
Block *getBody() { return &region().front(); }
292+
unsigned getNumInductionVars() {
293+
return getBody()->getNumArguments();
294+
}
292295
iterator_range<Block::args_iterator> getInductionVars() {
293296
return {getBody()->args_begin(), getBody()->args_end()};
294297
}

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
1818
#include "mlir/Dialect/AffineOps/AffineOps.h"
1919
#include "mlir/Dialect/GPU/GPUDialect.h"
20+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
2021
#include "mlir/Dialect/LoopOps/LoopOps.h"
2122
#include "mlir/Dialect/StandardOps/IR/Ops.h"
2223
#include "mlir/IR/AffineExpr.h"
@@ -508,23 +509,20 @@ struct MappingAnnotation {
508509

509510
} // namespace
510511

511-
static constexpr const char *kProcessorEntryName = "processor";
512-
static constexpr const char *kIndexMapEntryName = "map";
513-
static constexpr const char *kBoundMapEntryName = "bound";
514-
515512
/// Extracts the mapping annotations from the provided attribute. The attribute
516513
/// is expected to be of the form
517514
/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
518515
/// where the bound is optional.
519516
static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
520517
DictionaryAttr dict = attribute.cast<DictionaryAttr>();
521-
unsigned processor = dict.get(kProcessorEntryName)
518+
unsigned processor = dict.get(gpu::kProcessorEntryName)
522519
.cast<IntegerAttr>()
523520
.getValue()
524521
.getSExtValue();
525-
AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
522+
AffineMap map =
523+
dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
526524
AffineMapAttr boundAttr =
527-
dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
525+
dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
528526
AffineMap bound;
529527
if (boundAttr)
530528
bound = boundAttr.getValue();
@@ -583,7 +581,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
583581
PatternRewriter &rewriter) {
584582
// TODO(herhut): Verify that this is a valid GPU mapping.
585583
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
586-
ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
584+
ArrayAttr mapping =
585+
parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
587586

588587
// TODO(herhut): Support reductions.
589588
if (!mapping || parallelOp.getNumResults() != 0)

mlir/lib/Dialect/GPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ add_llvm_library(MLIRGPU
33
Transforms/AllReduceLowering.cpp
44
Transforms/KernelOutlining.cpp
55
Transforms/MemoryPromotion.cpp
6+
Transforms/ParallelLoopMapper.cpp
67

78
ADDITIONAL_HEADER_DIRS
89
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements utilities to generate mappings for parallel loops to
10+
// GPU devices.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
15+
16+
#include "mlir/Dialect/GPU/GPUDialect.h"
17+
#include "mlir/Dialect/GPU/Passes.h"
18+
#include "mlir/Dialect/LoopOps/LoopOps.h"
19+
#include "mlir/IR/AffineMap.h"
20+
#include "mlir/Pass/Pass.h"
21+
22+
using namespace mlir;
23+
using namespace mlir::gpu;
24+
using namespace mlir::loop;
25+
26+
namespace {

/// Hardware level a parallel loop dimension is mapped to. Mapping proceeds
/// outermost-to-innermost: grid first, then block, then sequential for all
/// deeper nesting levels.
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };

/// Number of hardware ids (x, y, z) available at each mapping level.
static constexpr int kNumHardwareIds = 3;

} // namespace

/// Bounded increment on MappingLevel. Increments to the next
/// level unless Sequential was already reached.
/// Marked static: this file-local operator on a file-local enum must have
/// internal linkage to avoid ODR clashes with identically-named operators
/// in other translation units.
static MappingLevel &operator++(MappingLevel &mappingLevel) {
  if (mappingLevel < Sequential) {
    mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
  }
  return mappingLevel;
}

/// Computes the hardware id to use for a given mapping level. Will
/// assign x,y and z hardware ids for the first 3 dimensions and use
/// sequential after.
static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
  // Dimensions beyond the available hardware ids, as well as everything at
  // the Sequential level, collapse onto the single sequential id
  // (Sequential * kNumHardwareIds == 6).
  if (dimension >= kNumHardwareIds || level == Sequential)
    return Sequential * kNumHardwareIds;
  return (level * kNumHardwareIds) + dimension;
}
51+
52+
/// Add mapping information to the given parallel loop. Do not add
53+
/// mapping information if the loop already has it. Also, don't
54+
/// start a mapping at a nested loop.
55+
static void mapParallelOp(ParallelOp parallelOp,
56+
MappingLevel mappingLevel = MapGrid) {
57+
// Do not try to add a mapping to already mapped loops or nested loops.
58+
if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
59+
((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
60+
return;
61+
62+
MLIRContext *ctx = parallelOp.getContext();
63+
Builder b(ctx);
64+
SmallVector<Attribute, 4> attrs;
65+
attrs.reserve(parallelOp.getNumInductionVars());
66+
for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
67+
SmallVector<NamedAttribute, 3> entries;
68+
entries.emplace_back(b.getNamedAttr(
69+
kProcessorEntryName,
70+
b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
71+
entries.emplace_back(b.getNamedAttr(
72+
kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
73+
entries.emplace_back(b.getNamedAttr(
74+
kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
75+
attrs.push_back(DictionaryAttr::get(entries, ctx));
76+
}
77+
parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
78+
++mappingLevel;
79+
// Parallel loop operations are immediately nested, so do not use
80+
// walk but just iterate over the operations.
81+
for (Operation &op : *parallelOp.getBody()) {
82+
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
83+
mapParallelOp(nested, mappingLevel);
84+
}
85+
}
86+
87+
/// Walks every parallel loop in the region and greedily attaches GPU mapping
/// annotations, starting each outermost loop at the grid level.
void mlir::greedilyMapParallelLoopsToGPU(Region &region) {
  region.walk([](ParallelOp candidate) { mapParallelOp(candidate); });
}

mlir/test/Dialect/GPU/mapping.mlir

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
2+
3+
func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
4+
%arg3 : index) {
5+
%zero = constant 0 : index
6+
%one = constant 1 : index
7+
%four = constant 4 : index
8+
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
9+
step (%four, %four) {
10+
loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
11+
step (%one, %one) {
12+
}
13+
}
14+
return
15+
}
16+
17+
// CHECK-LABEL: func @parallel_loop(
18+
// CHECK: loop.parallel
19+
// CHECK: loop.parallel
20+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
21+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
22+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
23+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
24+
// CHECK-NOT: mapping
25+
26+
// -----
27+
28+
func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
29+
%arg3 : index) {
30+
%zero = constant 0 : index
31+
%one = constant 1 : index
32+
%four = constant 4 : index
33+
loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3)
34+
step (%four, %four, %four, %four) {
35+
loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
36+
step (%one, %one, %one, %one) {
37+
loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
38+
step (%one, %one, %one, %one) {
39+
}
40+
}
41+
}
42+
return
43+
}
44+
45+
// CHECK-LABEL: func @parallel_loop_4d(
46+
// CHECK: loop.parallel
47+
// CHECK: loop.parallel
48+
// CHECK: loop.parallel
49+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
50+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
51+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
52+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
53+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
54+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
55+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
56+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
57+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
58+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
59+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
60+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
61+
// CHECK-NOT: mapping

mlir/test/lib/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_llvm_library(MLIRTestTransforms
55
TestConstantFold.cpp
66
TestLoopFusion.cpp
77
TestGpuMemoryPromotion.cpp
8+
TestGpuParallelLoopMapping.cpp
89
TestInlining.cpp
910
TestLinalgTransforms.cpp
1011
TestLiveness.cpp
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements the pass testing the utilities for mapping parallel
10+
// loops to gpu hardware ids.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
15+
#include "mlir/Pass/Pass.h"
16+
17+
using namespace mlir;
18+
19+
namespace {
20+
/// Simple pass for testing the mapping of parallel loops to hardware ids using
21+
/// a greedy mapping stratgegy.
22+
class TestGpuGreedyParallelLoopMappingPass
23+
: public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
24+
void runOnOperation() override {
25+
Operation *op = getOperation();
26+
for (Region &region : op->getRegions())
27+
greedilyMapParallelLoopsToGPU(region);
28+
}
29+
};
30+
} // end namespace
31+
32+
namespace mlir {
33+
void registerTestGpuParallelLoopMappingPass() {
34+
PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
35+
"test-gpu-greedy-parallel-loop-mapping",
36+
"Greedily maps all parallel loops to gpu hardware ids.");
37+
}
38+
} // namespace mlir

mlir/tools/mlir-opt/mlir-opt.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ void registerTestMemRefDependenceCheck();
5050
void registerTestMemRefStrideCalculation();
5151
void registerTestOpaqueLoc();
5252
void registerTestParallelismDetection();
53+
void registerTestGpuParallelLoopMappingPass();
5354
void registerTestVectorConversions();
5455
void registerTestVectorToLoopsPass();
5556
void registerVectorizerTestPass();
@@ -103,6 +104,7 @@ void registerTestPasses() {
103104
registerTestMemRefStrideCalculation();
104105
registerTestOpaqueLoc();
105106
registerTestParallelismDetection();
107+
registerTestGpuParallelLoopMappingPass();
106108
registerTestVectorConversions();
107109
registerTestVectorToLoopsPass();
108110
registerVectorizerTestPass();

0 commit comments

Comments
 (0)