
Commit 22a32f7
[mlir][gpu] Add dump-ptx option
When targeting NVIDIA GPUs, it is often important to inspect the generated PTX, but there is currently no simple way to do so. This change adds a dump-ptx option to the gpu-to-cubin pass. It can be used like `gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}`.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D155166
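
For example, the option can be exercised end to end with a pipeline like the one in the test added below (a minimal sketch mirroring the test's RUN lines; input.mlir is a placeholder file, and the PTX is printed to the debug stream, i.e. stderr):

  mlir-opt input.mlir \
    | mlir-opt -gpu-kernel-outlining \
    | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))'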

3 files changed (+32, -5 lines)

mlir/include/mlir/Dialect/GPU/Transforms/Passes.h

Lines changed: 5 additions & 1 deletion
@@ -117,6 +117,9 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+  Option<bool> dumpPtx{*this, "dump-ptx",
+                       ::llvm::cl::desc("Dump generated PTX"),
+                       llvm::cl::init(false)};
 };
 } // namespace gpu

@@ -137,7 +140,8 @@ void registerGpuSerializeToHsacoPass();
 std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
                                                     StringRef chip,
                                                     StringRef features,
-                                                    int optLevel = 2);
+                                                    int optLevel = 2,
+                                                    bool dumpPtx = false);

 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.
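
With the extended factory signature above, the pass can also be created programmatically. A minimal sketch; the surrounding pass-manager setup and the helper name buildCubinPipeline are illustrative, not part of this change:

  // Illustrative sketch: schedule cubin serialization on the nested GPU
  // modules, with PTX dumping enabled via the new parameter.
  #include "mlir/Dialect/GPU/Transforms/Passes.h"
  #include "mlir/Pass/PassManager.h"

  void buildCubinPipeline(mlir::PassManager &pm) {
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(
        mlir::createGpuSerializeToCubinPass(
            /*triple=*/"nvptx64-nvidia-cuda", /*chip=*/"sm_90",
            /*features=*/"+ptx80", /*optLevel=*/2, /*dumpPtx=*/true));
  }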

mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp

Lines changed: 12 additions & 4 deletions
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"

 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"

@@ -50,7 +51,7 @@ class SerializeToCubinPass

   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
                        StringRef chip = "sm_35", StringRef features = "+ptx60",
-                       int optLevel = 2);
+                       int optLevel = 2, bool dumpPtx = false);

   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {

@@ -73,10 +74,12 @@ static void maybeSetOption(Pass::Option<std::string> &option, StringRef value) {
 }

 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
-                                           StringRef features, int optLevel) {
+                                           StringRef features, int optLevel,
+                                           bool dumpPtx) {
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
+  this->dumpPtx = dumpPtx;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }

@@ -112,6 +115,10 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
       &linkState));

   auto kernelName = getOperation().getName().str();
+  if (dumpPtx) {
+    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
+    llvm::dbgs() << isa << "\n";
+  }
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),

@@ -151,9 +158,10 @@ void mlir::registerGpuSerializeToCubinPass() {
 std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
                                                           StringRef arch,
                                                           StringRef features,
-                                                          int optLevel) {
+                                                          int optLevel,
+                                                          bool dumpPtx) {
   return std::make_unique<SerializeToCubinPass>(triple, arch, features,
-                                                optLevel);
+                                                optLevel, dumpPtx);
 }

 #else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
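
With dump-ptx set, serializeISA prints the GPU module's name and the full PTX to llvm::dbgs() (stderr) just before JIT-linking. The output looks roughly like this, abridged and illustrative; the " Kernel Name : [...]" format comes from the code above, and the PTX lines match the test's CHECK lines:

   Kernel Name : [kernel_a]
  //
  // Generated by LLVM NVPTX Back-End
  //
  ...
  .visible .func kernel_a()
  {
          ret;
  }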
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: Generated by LLVM NVPTX Back-End
+// CHECK: .visible .func kernel_a()
+// CHECK: ret;
+
+gpu.module @bar {
+  llvm.func @kernel_a()
+    attributes { gpu.kernel } {
+    llvm.return
+  }
+}
