Skip to content

Commit e7d7250

Browse files
authored
[GEN] Implement GPU to GEN lowering (#13427)
Implements the lowering of the GPU dialect to the GEN dialect where possible. Currently there are only 6 GEN operations, so the lowering is a bit limited. Signed-off-by: Finlay Marno [email protected] --------- Signed-off-by: Finlay Marno <[email protected]>
1 parent 25f6fcb commit e7d7250

File tree

7 files changed

+349
-0
lines changed

7 files changed

+349
-0
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
//===- GPUToGEN.h - GPU to GEN Passes ------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Provides passes to convert GPU dialect to GEN dialect.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_CONVERSION_GPUTOGEN_GPUTOGEN_H
#define MLIR_CONVERSION_GPUTOGEN_GPUTOGEN_H

#include <memory>

namespace mlir {

class Pass;
class RewritePatternSet;

// Generated pass declaration for ConvertGpuOpsToGENOps (see Passes.td).
#define GEN_PASS_DECL_CONVERTGPUOPSTOGENOPS
#include "mlir/Conversion/Passes.h.inc"

/// Appends to `patterns` the conversions from GPU dialect ops to their GEN
/// dialect equivalents (index queries, barrier, and shuffle).
void populateGPUToGENPatterns(RewritePatternSet &patterns);

} // namespace mlir
#endif // MLIR_CONVERSION_GPUTOGEN_GPUTOGEN_H

mlir/include/mlir/Conversion/Passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "mlir/Conversion/GENToLLVM/GENToLLVM.h"
3737
#include "mlir/Conversion/GENToSPIRV/GENToSPIRV.h"
3838
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
39+
#include "mlir/Conversion/GPUToGEN/GPUToGEN.h"
3940
#include "mlir/Conversion/GPUToGENX/GPUToGENXPass.h"
4041
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
4142
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

mlir/include/mlir/Conversion/Passes.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,15 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
540540
let dependentDialects = ["LLVM::LLVMDialect"];
541541
}
542542

543+
//===----------------------------------------------------------------------===//
// GPUToGEN
//===----------------------------------------------------------------------===//

def ConvertGpuOpsToGENOps : Pass<"convert-gpu-to-gen"> {
  let summary = "Generate GEN operations for gpu operations";
  // The lowering patterns materialize arith.constant ops (dimension operands
  // and the shuffle validity bit), so the Arith dialect must be declared as a
  // dependent dialect in addition to GEN; otherwise creating those constants
  // can fail when the input module contains no arith ops.
  let dependentDialects = ["arith::ArithDialect", "GEN::GENDialect"];
}
551+
543552
//===----------------------------------------------------------------------===//
544553
// GPUToGENX
545554
//===----------------------------------------------------------------------===//

mlir/lib/Conversion/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ add_subdirectory(FuncToSPIRV)
2525
add_subdirectory(GENToLLVM)
2626
add_subdirectory(GENToSPIRV)
2727
add_subdirectory(GPUCommon)
28+
add_subdirectory(GPUToGEN)
2829
add_subdirectory(GPUToGENX)
2930
add_subdirectory(GPUToNVVM)
3031
add_subdirectory(GPUToROCDL)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
add_mlir_conversion_library(MLIRGPUToGEN
  GPUToGEN.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToGEN

  DEPENDS
  MLIRConversionPassIncGen

  LINK_COMPONENTS
  Core

  # GPUToGEN.cpp creates arith constants and uses the pass / dialect-conversion
  # infrastructure, so those libraries must be linked in addition to the GPU
  # and GEN dialect libraries.
  LINK_LIBS PUBLIC
  MLIRArithDialect
  MLIRGPUDialect
  MLIRGENDialect
  MLIRPass
  MLIRTransformUtils
  )
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
//===- GPUToGEN.cpp - GPU to GEN Patterns ----------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements patterns to convert GPU dialect to GEN dialect.
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "mlir/Conversion/GPUToGEN/GPUToGEN.h"
14+
15+
#include "mlir/Dialect/Arith/IR/Arith.h"
16+
#include "mlir/Dialect/GEN/IR/GENDialect.h"
17+
#include "mlir/Dialect/GEN/IR/GENOps.h"
18+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
19+
#include "mlir/IR/MLIRContext.h"
20+
#include "mlir/IR/Matchers.h"
21+
#include "mlir/Pass/Pass.h"
22+
#include "mlir/Transforms/DialectConversion.h"
23+
24+
#include "llvm/Support/Debug.h"
25+
26+
namespace mlir {
27+
#define GEN_PASS_DEF_CONVERTGPUOPSTOGENOPS
28+
#include "mlir/Conversion/Passes.h.inc"
29+
} // namespace mlir
30+
31+
using namespace mlir;
32+
33+
template <typename GPUOp, typename GENOp>
34+
class GPUIndexOpToGENLowering : public OpConversionPattern<GPUOp> {
35+
public:
36+
using OpConversionPattern<GPUOp>::OpConversionPattern;
37+
using OpAdaptor = typename GPUOp::Adaptor;
38+
39+
LogicalResult
40+
matchAndRewrite(GPUOp op, OpAdaptor adaptor,
41+
ConversionPatternRewriter &rewriter) const final {
42+
auto dim = static_cast<std::uint32_t>(adaptor.getDimension());
43+
Value idxDim = rewriter.create<arith::ConstantIntOp>(op->getLoc(), dim, 32);
44+
rewriter.replaceOpWithNewOp<GENOp>(op, rewriter.getIndexType(), idxDim);
45+
return success();
46+
}
47+
};
48+
49+
/// Lowers gpu.barrier to gen.barrier.
class GPUBarrierToGENLowering : public OpConversionPattern<gpu::BarrierOp> {
public:
  using OpConversionPattern<gpu::BarrierOp>::OpConversionPattern;
  using OpAdaptor = typename gpu::BarrierOp::Adaptor;

  // The original split match()/rewrite() form is deprecated in the conversion
  // framework; since this conversion can never fail, the two are folded into
  // the single matchAndRewrite entry point.
  LogicalResult
  matchAndRewrite(gpu::BarrierOp op, OpAdaptor,
                  ConversionPatternRewriter &rewriter) const final {
    rewriter.replaceOpWithNewOp<GEN::BarrierOp>(op);
    return success();
  }
};
61+
62+
class GPUShuffleToGENLowering : public OpConversionPattern<gpu::ShuffleOp> {
63+
public:
64+
using OpConversionPattern<gpu::ShuffleOp>::OpConversionPattern;
65+
using OpAdaptor = typename gpu::ShuffleOp::Adaptor;
66+
67+
LogicalResult
68+
matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
69+
ConversionPatternRewriter &rewriter) const final {
70+
71+
auto gpuMode = adaptor.getMode();
72+
const auto genMode = [](gpu::ShuffleMode mode) {
73+
switch (mode) {
74+
case gpu::ShuffleMode::XOR:
75+
return GEN::ShflKind::XOR;
76+
case gpu::ShuffleMode::DOWN:
77+
return GEN::ShflKind::DOWN;
78+
case gpu::ShuffleMode::UP:
79+
return GEN::ShflKind::UP;
80+
case gpu::ShuffleMode::IDX:
81+
return GEN::ShflKind::IDX;
82+
}
83+
llvm_unreachable("expected a matching shuffle mode");
84+
}(gpuMode);
85+
86+
// TODO unable to validate gpu width parameter, potential for producing
87+
// invalid code
88+
IntegerAttr widthAttr;
89+
if (!matchPattern(adaptor.getWidth(), m_Constant(&widthAttr))) {
90+
return rewriter.notifyMatchFailure(
91+
op, "shuffle width must be a constant value");
92+
}
93+
94+
Value trueValue = rewriter.create<arith::ConstantOp>(
95+
op->getLoc(), rewriter.getBoolAttr(true));
96+
auto result = rewriter.create<GEN::SubGroupShuffleOp>(
97+
op->getLoc(), op->getResult(0).getType(), adaptor.getValue(),
98+
adaptor.getOffset(), genMode);
99+
100+
rewriter.replaceOp(op, {result, trueValue});
101+
return success();
102+
}
103+
};
104+
105+
/// Registers all GPU-to-GEN conversion patterns on `patterns`.
void mlir::populateGPUToGENPatterns(RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();
  // Index queries: thread/block ids and sizes.
  patterns.add<GPUIndexOpToGENLowering<gpu::ThreadIdOp, GEN::LocalIdOp>,
               GPUIndexOpToGENLowering<gpu::BlockIdOp, GEN::WorkGroupIdOp>,
               GPUIndexOpToGENLowering<gpu::BlockDimOp, GEN::WorkGroupSizeOp>,
               GPUIndexOpToGENLowering<gpu::GridDimOp, GEN::NumWorkGroupsOp>>(
      ctx);
  // Synchronization and sub-group data exchange.
  patterns.add<GPUBarrierToGENLowering, GPUShuffleToGENLowering>(ctx);
}
113+
114+
namespace {
115+
struct ConvertGpuOpsToGENOpsPass
116+
: public impl::ConvertGpuOpsToGENOpsBase<ConvertGpuOpsToGENOpsPass> {
117+
void runOnOperation() override {
118+
ConversionTarget target(getContext());
119+
120+
target.addLegalOp<arith::ConstantOp>();
121+
target.addLegalDialect<GEN::GENDialect>();
122+
// The ops of gpu dialect that can currently be mapped to GEN
123+
target.addIllegalOp<gpu::ThreadIdOp, gpu::BlockIdOp, gpu::BlockDimOp,
124+
gpu::GridDimOp, gpu::BarrierOp, gpu::ShuffleOp>();
125+
126+
mlir::RewritePatternSet patterns(&getContext());
127+
populateGPUToGENPatterns(patterns);
128+
129+
if (failed(applyPartialConversion(getOperation(), target,
130+
std::move(patterns))))
131+
signalPassFailure();
132+
}
133+
};
134+
} // namespace
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
// RUN: mlir-opt -split-input-file -convert-gpu-to-gen %s | FileCheck %s
2+
3+
// gpu.thread_id lowers to gen.local_id with the dimension as an i32 constant.
gpu.module @local_id_kernels {
  // CHECK-LABEL: gen_local_id_x
  gpu.func @gen_local_id_x() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 0 : i32
    // CHECK: gen.local_id [[DIM]]
    %0 = gpu.thread_id x
    gpu.return
  }

  // CHECK-LABEL: gen_local_id_y
  gpu.func @gen_local_id_y() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 1 : i32
    // CHECK: gen.local_id [[DIM]]
    %0 = gpu.thread_id y
    gpu.return
  }

  // CHECK-LABEL: gen_local_id_z
  gpu.func @gen_local_id_z() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 2 : i32
    // CHECK: gen.local_id [[DIM]]
    %0 = gpu.thread_id z
    gpu.return
  }
}
28+
29+
// -----
30+
31+
32+
// gpu.block_id lowers to gen.work_group_id with the dimension as an i32 constant.
gpu.module @work_group_id_kernels {
  // CHECK-LABEL: gen_work_group_id_x
  gpu.func @gen_work_group_id_x() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 0 : i32
    // CHECK: gen.work_group_id [[DIM]]
    %0 = gpu.block_id x
    gpu.return
  }

  // CHECK-LABEL: gen_work_group_id_y
  gpu.func @gen_work_group_id_y() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 1 : i32
    // CHECK: gen.work_group_id [[DIM]]
    %0 = gpu.block_id y
    gpu.return
  }

  // CHECK-LABEL: gen_work_group_id_z
  gpu.func @gen_work_group_id_z() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 2 : i32
    // CHECK: gen.work_group_id [[DIM]]
    %0 = gpu.block_id z
    gpu.return
  }
}
57+
58+
// -----
59+
60+
61+
// gpu.block_dim lowers to gen.work_group_size with the dimension as an i32 constant.
gpu.module @work_group_size_kernels {
  // CHECK-LABEL: gen_work_group_size_x
  gpu.func @gen_work_group_size_x() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 0 : i32
    // CHECK: gen.work_group_size [[DIM]]
    %0 = gpu.block_dim x
    gpu.return
  }

  // CHECK-LABEL: gen_work_group_size_y
  gpu.func @gen_work_group_size_y() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 1 : i32
    // CHECK: gen.work_group_size [[DIM]]
    %0 = gpu.block_dim y
    gpu.return
  }

  // CHECK-LABEL: gen_work_group_size_z
  gpu.func @gen_work_group_size_z() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 2 : i32
    // CHECK: gen.work_group_size [[DIM]]
    %0 = gpu.block_dim z
    gpu.return
  }
}
86+
87+
// -----
88+
89+
90+
// gpu.grid_dim lowers to gen.num_work_groups with the dimension as an i32 constant.
gpu.module @num_work_groups_kernels {
  // CHECK-LABEL: gen_num_work_groups_x
  gpu.func @gen_num_work_groups_x() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 0 : i32
    // CHECK: gen.num_work_groups [[DIM]]
    %0 = gpu.grid_dim x
    gpu.return
  }

  // CHECK-LABEL: gen_num_work_groups_y
  gpu.func @gen_num_work_groups_y() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 1 : i32
    // CHECK: gen.num_work_groups [[DIM]]
    %0 = gpu.grid_dim y
    gpu.return
  }

  // CHECK-LABEL: gen_num_work_groups_z
  gpu.func @gen_num_work_groups_z() kernel {
    // CHECK: [[DIM:%.*]] = arith.constant 2 : i32
    // CHECK: gen.num_work_groups [[DIM]]
    %0 = gpu.grid_dim z
    gpu.return
  }
}
115+
116+
// -----
117+
118+
// gpu.barrier lowers directly to gen.barrier.
gpu.module @barrier_kernels {
  // CHECK-LABEL: gen_barrier
  gpu.func @gen_barrier() kernel {
    // CHECK: gen.barrier
    gpu.barrier
    gpu.return
  }
}
126+
127+
// -----
128+
129+
// gpu.shuffle (with constant width) lowers to gen.sub_group_shuffle.
// Fixed: the original directive read "// CHECK-LABEL gpu.module ..." without
// the colon, so FileCheck silently ignored it.
// CHECK-LABEL: gpu.module @shuffle_kernels
gpu.module @shuffle_kernels {
  // CHECK: gpu.func @gen_shuffle_xor(%[[IN_XOR:.*]]: f32, %[[OFFSET_XOR:.*]]: i32) kernel {
  gpu.func @gen_shuffle_xor(%in : f32, %offset: i32) kernel {
    // CHECK: %{{.*}} = gen.sub_group_shuffle xor %[[IN_XOR]], %[[OFFSET_XOR]] : f32
    %width = arith.constant 32 : i32
    %0, %1 = gpu.shuffle xor %in, %offset, %width : f32
    gpu.return
  }
  // CHECK: gpu.func @gen_shuffle_up(%[[IN_UP:.*]]: f32, %[[OFFSET_UP:.*]]: i32) kernel {
  gpu.func @gen_shuffle_up(%in : f32, %offset: i32) kernel {
    // CHECK: %{{.*}} = gen.sub_group_shuffle up %[[IN_UP]], %[[OFFSET_UP]] : f32
    %width = arith.constant 32 : i32
    %0, %1 = gpu.shuffle up %in, %offset, %width : f32
    gpu.return
  }
  // CHECK: gpu.func @gen_shuffle_down(%[[IN_DOWN:.*]]: f32, %[[OFFSET_DOWN:.*]]: i32) kernel {
  gpu.func @gen_shuffle_down(%in : f32, %offset: i32) kernel {
    // CHECK: %{{.*}} = gen.sub_group_shuffle down %[[IN_DOWN]], %[[OFFSET_DOWN]] : f32
    %width = arith.constant 32 : i32
    %0, %1 = gpu.shuffle down %in, %offset, %width : f32
    gpu.return
  }
  // CHECK: gpu.func @gen_shuffle_idx(%[[IN_IDX:.*]]: f32, %[[OFFSET_IDX:.*]]: i32) kernel {
  gpu.func @gen_shuffle_idx(%in : f32, %offset: i32) kernel {
    // CHECK: %{{.*}} = gen.sub_group_shuffle idx %[[IN_IDX]], %[[OFFSET_IDX]] : f32
    %width = arith.constant 32 : i32
    %0, %1 = gpu.shuffle idx %in, %offset, %width : f32
    gpu.return
  }
}

0 commit comments

Comments
 (0)