Skip to content

Commit 7a7eacc

Browse files
author
Stephan Herhut
committed
[MLIR][GPU] Implement a simple greedy loop mapper.
Summary: The mapper assigns annotations to loop.parallel operations that are compatible with the loop-to-GPU mapping pass. The outermost loop uses the grid dimensions, followed by block dimensions. All remaining loops are mapped to sequential loops. Differential Revision: https://reviews.llvm.org/D74963
1 parent 157b3d5 commit 7a7eacc

File tree

9 files changed

+252
-8
lines changed

9 files changed

+252
-8
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file declares the utilities to generate mappings for parallel
// loops to GPU devices.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

namespace mlir {

struct Region;

namespace gpu {

/// Name of the mapping attribute produced by loop mappers.
static constexpr const char *kMappingAttributeName = "mapping";
/// Name of the processor sub-attribute that identifies the hardware id
/// to map a loop to.
static constexpr const char *kProcessorEntryName = "processor";
/// Name of the map sub-attribute that identifies the affine map to apply
/// to the hardware id to compute the iteration number of the loop. This
/// map is expected to be extended by step and lower bound computations:
///   index = map(hardware_id) * step + lowerbound
static constexpr const char *kIndexMapEntryName = "map";
/// Name of the bound sub-attribute that identifies the affine map to
/// compute an upper bound of iterations for the hardware id. This is
/// applied to an upper bound on the number of iterations:
///   launchBound = bound(upperbound-lowerbound ceildiv step)
static constexpr const char *kBoundMapEntryName = "bound";

} // end namespace gpu

/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
void greedilyMapParallelLoopsToGPU(Region &region);

} // end namespace mlir

#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

mlir/include/mlir/Dialect/LoopOps/LoopOps.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,9 @@ def ParallelOp : Loop_Op<"parallel",
289289

290290
let extraClassDeclaration = [{
291291
Block *getBody() { return &region().front(); }
292+
unsigned getNumInductionVars() {
293+
return getBody()->getNumArguments();
294+
}
292295
iterator_range<Block::args_iterator> getInductionVars() {
293296
return {getBody()->args_begin(), getBody()->args_end()};
294297
}

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
1818
#include "mlir/Dialect/AffineOps/AffineOps.h"
1919
#include "mlir/Dialect/GPU/GPUDialect.h"
20+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
2021
#include "mlir/Dialect/LoopOps/LoopOps.h"
2122
#include "mlir/Dialect/StandardOps/IR/Ops.h"
2223
#include "mlir/IR/AffineExpr.h"
@@ -508,23 +509,20 @@ struct MappingAnnotation {
508509

509510
} // namespace
510511

511-
static constexpr const char *kProcessorEntryName = "processor";
512-
static constexpr const char *kIndexMapEntryName = "map";
513-
static constexpr const char *kBoundMapEntryName = "bound";
514-
515512
/// Extracts the mapping annotations from the provided attribute. The attribute
516513
/// is expected to be of the form
517514
/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
518515
/// where the bound is optional.
519516
static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
520517
DictionaryAttr dict = attribute.cast<DictionaryAttr>();
521-
unsigned processor = dict.get(kProcessorEntryName)
518+
unsigned processor = dict.get(gpu::kProcessorEntryName)
522519
.cast<IntegerAttr>()
523520
.getValue()
524521
.getSExtValue();
525-
AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
522+
AffineMap map =
523+
dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
526524
AffineMapAttr boundAttr =
527-
dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
525+
dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
528526
AffineMap bound;
529527
if (boundAttr)
530528
bound = boundAttr.getValue();
@@ -583,7 +581,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
583581
PatternRewriter &rewriter) {
584582
// TODO(herhut): Verify that this is a valid GPU mapping.
585583
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
586-
ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
584+
ArrayAttr mapping =
585+
parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
587586

588587
// TODO(herhut): Support reductions.
589588
if (!mapping || parallelOp.getNumResults() != 0)

mlir/lib/Dialect/GPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ add_llvm_library(MLIRGPU
33
Transforms/AllReduceLowering.cpp
44
Transforms/KernelOutlining.cpp
55
Transforms/MemoryPromotion.cpp
6+
Transforms/ParallelLoopMapper.cpp
67

78
ADDITIONAL_HEADER_DIRS
89
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements utilities to generate mappings for parallel loops to
10+
// GPU devices.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
15+
16+
#include "mlir/Dialect/GPU/GPUDialect.h"
17+
#include "mlir/Dialect/GPU/Passes.h"
18+
#include "mlir/Dialect/LoopOps/LoopOps.h"
19+
#include "mlir/IR/AffineMap.h"
20+
#include "mlir/Pass/Pass.h"
21+
22+
using namespace mlir;
23+
using namespace mlir::gpu;
24+
using namespace mlir::loop;
25+
26+
namespace {

/// Hardware level a parallel loop dimension is mapped to. Mapping proceeds
/// outermost-to-innermost: grid first, then block, then sequential for all
/// deeper nesting levels.
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };

/// Number of hardware ids (x, y, z) available at each mapping level.
static constexpr int kNumHardwareIds = 3;

} // namespace

/// Bounded increment on MappingLevel. Increments to the next
/// level unless Sequential was already reached.
/// Marked static: this file-local operator on a file-local enum must have
/// internal linkage to avoid ODR clashes with identically-named operators
/// in other translation units.
static MappingLevel &operator++(MappingLevel &mappingLevel) {
  if (mappingLevel < Sequential) {
    mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
  }
  return mappingLevel;
}

/// Computes the hardware id to use for a given mapping level. Will
/// assign x,y and z hardware ids for the first 3 dimensions and use
/// sequential after.
static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
  // Dimensions beyond the available hardware ids, as well as everything at
  // the Sequential level, collapse onto the single sequential id
  // (Sequential * kNumHardwareIds == 6).
  if (dimension >= kNumHardwareIds || level == Sequential)
    return Sequential * kNumHardwareIds;
  return (level * kNumHardwareIds) + dimension;
}
51+
52+
/// Add mapping information to the given parallel loop. Do not add
53+
/// mapping information if the loop already has it. Also, don't
54+
/// start a mapping at a nested loop.
55+
static void mapParallelOp(ParallelOp parallelOp,
56+
MappingLevel mappingLevel = MapGrid) {
57+
// Do not try to add a mapping to already mapped loops or nested loops.
58+
if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
59+
((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
60+
return;
61+
62+
MLIRContext *ctx = parallelOp.getContext();
63+
Builder b(ctx);
64+
SmallVector<Attribute, 4> attrs;
65+
attrs.reserve(parallelOp.getNumInductionVars());
66+
for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
67+
SmallVector<NamedAttribute, 3> entries;
68+
entries.emplace_back(b.getNamedAttr(
69+
kProcessorEntryName,
70+
b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
71+
entries.emplace_back(b.getNamedAttr(
72+
kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
73+
entries.emplace_back(b.getNamedAttr(
74+
kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
75+
attrs.push_back(DictionaryAttr::get(entries, ctx));
76+
}
77+
parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
78+
++mappingLevel;
79+
// Parallel loop operations are immediately nested, so do not use
80+
// walk but just iterate over the operations.
81+
for (Operation &op : *parallelOp.getBody()) {
82+
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
83+
mapParallelOp(nested, mappingLevel);
84+
}
85+
}
86+
87+
/// Walks every parallel loop in the region and greedily attaches GPU mapping
/// annotations, starting each outermost loop at the grid level.
void mlir::greedilyMapParallelLoopsToGPU(Region &region) {
  region.walk([](ParallelOp candidate) { mapParallelOp(candidate); });
}

mlir/test/Dialect/GPU/mapping.mlir

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
2+
3+
func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
4+
%arg3 : index) {
5+
%zero = constant 0 : index
6+
%one = constant 1 : index
7+
%four = constant 4 : index
8+
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
9+
step (%four, %four) {
10+
loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
11+
step (%one, %one) {
12+
}
13+
}
14+
return
15+
}
16+
17+
// CHECK-LABEL: func @parallel_loop(
18+
// CHECK: loop.parallel
19+
// CHECK: loop.parallel
20+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
21+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
22+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
23+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
24+
// CHECK-NOT: mapping
25+
26+
// -----
27+
28+
func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
29+
%arg3 : index) {
30+
%zero = constant 0 : index
31+
%one = constant 1 : index
32+
%four = constant 4 : index
33+
loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3)
34+
step (%four, %four, %four, %four) {
35+
loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
36+
step (%one, %one, %one, %one) {
37+
loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
38+
step (%one, %one, %one, %one) {
39+
}
40+
}
41+
}
42+
return
43+
}
44+
45+
// CHECK-LABEL: func @parallel_loop_4d(
46+
// CHECK: loop.parallel
47+
// CHECK: loop.parallel
48+
// CHECK: loop.parallel
49+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
50+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
51+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
52+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
53+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
54+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
55+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
56+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
57+
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
58+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
59+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
60+
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
61+
// CHECK-NOT: mapping

mlir/test/lib/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_llvm_library(MLIRTestTransforms
55
TestConstantFold.cpp
66
TestLoopFusion.cpp
77
TestGpuMemoryPromotion.cpp
8+
TestGpuParallelLoopMapping.cpp
89
TestInlining.cpp
910
TestLinalgTransforms.cpp
1011
TestLiveness.cpp
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements the pass testing the utilities for mapping parallel
10+
// loops to gpu hardware ids.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
15+
#include "mlir/Pass/Pass.h"
16+
17+
using namespace mlir;
18+
19+
namespace {
20+
/// Simple pass for testing the mapping of parallel loops to hardware ids using
21+
/// a greedy mapping stratgegy.
22+
class TestGpuGreedyParallelLoopMappingPass
23+
: public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
24+
void runOnOperation() override {
25+
Operation *op = getOperation();
26+
for (Region &region : op->getRegions())
27+
greedilyMapParallelLoopsToGPU(region);
28+
}
29+
};
30+
} // end namespace
31+
32+
namespace mlir {
33+
void registerTestGpuParallelLoopMappingPass() {
34+
PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
35+
"test-gpu-greedy-parallel-loop-mapping",
36+
"Greedily maps all parallel loops to gpu hardware ids.");
37+
}
38+
} // namespace mlir

mlir/tools/mlir-opt/mlir-opt.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ void registerTestMemRefDependenceCheck();
5050
void registerTestMemRefStrideCalculation();
5151
void registerTestOpaqueLoc();
5252
void registerTestParallelismDetection();
53+
void registerTestGpuParallelLoopMappingPass();
5354
void registerTestVectorConversions();
5455
void registerTestVectorToLoopsPass();
5556
void registerVectorizerTestPass();
@@ -103,6 +104,7 @@ void registerTestPasses() {
103104
registerTestMemRefStrideCalculation();
104105
registerTestOpaqueLoc();
105106
registerTestParallelismDetection();
107+
registerTestGpuParallelLoopMappingPass();
106108
registerTestVectorConversions();
107109
registerTestVectorToLoopsPass();
108110
registerVectorizerTestPass();

0 commit comments

Comments
 (0)