Skip to content

Commit 2224221

Browse files
committed
[mlir] Add NVVM to CUBIN conversion to mlir-opt
If MLIR_CUDA_RUNNER_ENABLED, register a 'gpu-to-cubin' conversion pass to mlir-opt. The next step is to switch CUDA integration tests from mlir-cuda-runner to mlir-opt + mlir-cpu-runner and remove mlir-cuda-runner.

Depends On D98279

Reviewed By: herhut, rriddle, mehdi_amini

Differential Revision: https://reviews.llvm.org/D98203
1 parent 80d1f65 commit 2224221

File tree

8 files changed

+226
-6
lines changed

8 files changed

+226
-6
lines changed

mlir/include/mlir/Dialect/GPU/Passes.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,18 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
5353

5454
void runOnOperation() final;
5555

56+
protected:
57+
void getDependentDialects(DialectRegistry &registry) const override;
58+
5659
private:
57-
// Creates the LLVM target machine to generate the ISA.
60+
/// Creates the LLVM target machine to generate the ISA.
5861
std::unique_ptr<llvm::TargetMachine> createTargetMachine();
5962

60-
// Translates the 'getOperation()' result to an LLVM module.
63+
/// Translates the 'getOperation()' result to an LLVM module.
6164
virtual std::unique_ptr<llvm::Module>
62-
translateToLLVMIR(llvm::LLVMContext &llvmContext) = 0;
65+
translateToLLVMIR(llvm::LLVMContext &llvmContext);
6366

64-
// Serializes the target ISA to binary form.
67+
/// Serializes the target ISA to binary form.
6568
virtual std::unique_ptr<std::vector<char>>
6669
serializeISA(const std::string &isa) = 0;
6770

@@ -83,6 +86,10 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
8386
// Registration
8487
//===----------------------------------------------------------------------===//
8588

89+
/// Register pass to serialize GPU kernel functions to a CUBIN binary
90+
/// annotation.
91+
void registerGpuSerializeToCubinPass();
92+
8693
/// Generate the code for registering passes.
8794
#define GEN_PASS_REGISTRATION
8895
#include "mlir/Dialect/GPU/Passes.h.inc"

mlir/include/mlir/InitAllPasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ inline void registerAllPasses() {
5151
registerAffinePasses();
5252
registerAsyncPasses();
5353
registerGPUPasses();
54+
registerGpuSerializeToCubinPass();
5455
registerLinalgPasses();
5556
LLVM::registerLLVMPasses();
5657
quant::registerQuantPasses();

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
2424
intrinsics_gen
2525

2626
LINK_COMPONENTS
27+
Core
28+
MC
2729
${AMDGPU_LIBS}
2830
${NVPTX_LIBS}
2931

mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ class GpuKernelToBlobPass
6161

6262
private:
6363
// Translates the 'getOperation()' result to an LLVM module.
64+
// Note: when this class is removed, this function no longer needs to be
65+
// virtual.
6466
std::unique_ptr<llvm::Module>
6567
translateToLLVMIR(llvm::LLVMContext &llvmContext) override {
6668
return loweringCallback(getOperation(), llvmContext, "LLVMDialectModule");

mlir/lib/Dialect/GPU/CMakeLists.txt

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
if (MLIR_CUDA_CONVERSIONS_ENABLED)
2+
set(NVPTX_LIBS
3+
NVPTXCodeGen
4+
NVPTXDesc
5+
NVPTXInfo
6+
)
7+
endif()
8+
19
add_mlir_dialect_library(MLIRGPU
210
IR/GPUDialect.cpp
311
Transforms/AllReduceLowering.cpp
@@ -6,13 +14,15 @@ add_mlir_dialect_library(MLIRGPU
614
Transforms/MemoryPromotion.cpp
715
Transforms/ParallelLoopMapper.cpp
816
Transforms/SerializeToBlob.cpp
17+
Transforms/SerializeToCubin.cpp
918

1019
ADDITIONAL_HEADER_DIRS
1120
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
1221

1322
LINK_COMPONENTS
1423
Core
1524
MC
25+
${NVPTX_LIBS}
1626

1727
DEPENDS
1828
MLIRGPUOpsIncGen
@@ -26,10 +36,50 @@ add_mlir_dialect_library(MLIRGPU
2636
MLIREDSC
2737
MLIRIR
2838
MLIRLLVMIR
39+
MLIRLLVMToLLVMIRTranslation
2940
MLIRSCF
3041
MLIRPass
3142
MLIRSideEffectInterfaces
3243
MLIRStandard
3344
MLIRSupport
3445
MLIRTransformUtils
3546
)
47+
48+
if(MLIR_CUDA_RUNNER_ENABLED)
49+
if(NOT MLIR_CUDA_CONVERSIONS_ENABLED)
50+
message(SEND_ERROR
51+
"Building mlir with cuda support requires the NVPTX backend")
52+
endif()
53+
54+
# Configure CUDA language support. Using check_language first allows us to
55+
# give a custom error message.
56+
include(CheckLanguage)
57+
check_language(CUDA)
58+
if (CMAKE_CUDA_COMPILER)
59+
enable_language(CUDA)
60+
else()
61+
message(SEND_ERROR
62+
"Building mlir with cuda support requires a working CUDA install")
63+
endif()
64+
65+
# Enable gpu-to-cubin pass.
66+
target_compile_definitions(obj.MLIRGPU
67+
PRIVATE
68+
MLIR_GPU_TO_CUBIN_PASS_ENABLE=1
69+
)
70+
71+
# Add CUDA headers includes and the libcuda.so library.
72+
target_include_directories(obj.MLIRGPU
73+
PRIVATE
74+
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
75+
)
76+
77+
find_library(CUDA_DRIVER_LIBRARY cuda)
78+
79+
target_link_libraries(MLIRGPU
80+
PRIVATE
81+
MLIRNVVMToLLVMIRTranslation
82+
${CUDA_DRIVER_LIBRARY}
83+
)
84+
85+
endif()

mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
#include "mlir/Dialect/GPU/Passes.h"
1616
#include "mlir/Pass/Pass.h"
17+
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
18+
#include "mlir/Target/LLVMIR/Export.h"
1719
#include "llvm/IR/LegacyPassManager.h"
1820
#include "llvm/Support/TargetRegistry.h"
1921
#include "llvm/Support/TargetSelect.h"
@@ -68,6 +70,12 @@ void gpu::SerializeToBlobPass::runOnOperation() {
6870
getOperation()->setAttr(gpuBinaryAnnotation, attr);
6971
}
7072

73+
void gpu::SerializeToBlobPass::getDependentDialects(
74+
DialectRegistry &registry) const {
75+
registerLLVMDialectTranslation(registry);
76+
OperationPass<gpu::GPUModuleOp>::getDependentDialects(registry);
77+
}
78+
7179
std::unique_ptr<llvm::TargetMachine>
7280
gpu::SerializeToBlobPass::createTargetMachine() {
7381
Location loc = getOperation().getLoc();
@@ -87,3 +95,9 @@ gpu::SerializeToBlobPass::createTargetMachine() {
8795

8896
return std::unique_ptr<llvm::TargetMachine>{machine};
8997
}
98+
99+
std::unique_ptr<llvm::Module>
100+
gpu::SerializeToBlobPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
101+
return translateModuleToLLVMIR(getOperation(), llvmContext,
102+
"LLVMDialectModule");
103+
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
//===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements a pass that serializes a gpu module into CUBIN blob and
10+
// adds that blob as a string attribute of the module.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
#include "mlir/Dialect/GPU/Passes.h"
14+
15+
#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
16+
#include "mlir/Pass/Pass.h"
17+
#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
18+
#include "mlir/Target/LLVMIR/Export.h"
19+
#include "llvm/Support/TargetSelect.h"
20+
21+
#include <cuda.h>
22+
23+
using namespace mlir;
24+
25+
static void emitCudaError(const llvm::Twine &expr, const char *buffer,
26+
CUresult result, Location loc) {
27+
const char *error;
28+
cuGetErrorString(result, &error);
29+
emitError(loc, expr.concat(" failed with error code ")
30+
.concat(llvm::Twine{error})
31+
.concat("[")
32+
.concat(buffer)
33+
.concat("]"));
34+
}
35+
36+
#define RETURN_ON_CUDA_ERROR(expr) \
37+
do { \
38+
if (auto status = (expr)) { \
39+
emitCudaError(#expr, jitErrorBuffer, status, loc); \
40+
return {}; \
41+
} \
42+
} while (false)
43+
44+
namespace {
45+
class SerializeToCubinPass
46+
: public PassWrapper<SerializeToCubinPass, gpu::SerializeToBlobPass> {
47+
public:
48+
SerializeToCubinPass();
49+
50+
private:
51+
void getDependentDialects(DialectRegistry &registry) const override;
52+
53+
// Serializes PTX to CUBIN.
54+
std::unique_ptr<std::vector<char>>
55+
serializeISA(const std::string &isa) override;
56+
};
57+
} // namespace
58+
59+
// Sets the 'option' to 'value' unless it already has a value.
60+
static void maybeSetOption(Pass::Option<std::string> &option,
61+
const char *value) {
62+
if (!option.hasValue())
63+
option = value;
64+
}
65+
66+
SerializeToCubinPass::SerializeToCubinPass() {
67+
maybeSetOption(this->triple, "nvptx64-nvidia-cuda");
68+
maybeSetOption(this->chip, "sm_35");
69+
maybeSetOption(this->features, "+ptx60");
70+
}
71+
72+
void SerializeToCubinPass::getDependentDialects(
73+
DialectRegistry &registry) const {
74+
registerNVVMDialectTranslation(registry);
75+
gpu::SerializeToBlobPass::getDependentDialects(registry);
76+
}
77+
78+
std::unique_ptr<std::vector<char>>
79+
SerializeToCubinPass::serializeISA(const std::string &isa) {
80+
Location loc = getOperation().getLoc();
81+
char jitErrorBuffer[4096] = {0};
82+
83+
RETURN_ON_CUDA_ERROR(cuInit(0));
84+
85+
// Linking requires a device context.
86+
CUdevice device;
87+
RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
88+
CUcontext context;
89+
RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
90+
CUlinkState linkState;
91+
92+
CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
93+
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
94+
void *jitOptionsVals[] = {jitErrorBuffer,
95+
reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
96+
97+
RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */
98+
jitOptions, /* jit options */
99+
jitOptionsVals, /* jit option values */
100+
&linkState));
101+
102+
auto kernelName = getOperation().getName().str();
103+
RETURN_ON_CUDA_ERROR(cuLinkAddData(
104+
linkState, CUjitInputType::CU_JIT_INPUT_PTX,
105+
const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
106+
kernelName.c_str(), 0, /* number of jit options */
107+
nullptr, /* jit options */
108+
nullptr /* jit option values */
109+
));
110+
111+
void *cubinData;
112+
size_t cubinSize;
113+
RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
114+
115+
char *cubinAsChar = static_cast<char *>(cubinData);
116+
auto result =
117+
std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
118+
119+
// This will also destroy the cubin data.
120+
RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
121+
RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
122+
123+
return result;
124+
}
125+
126+
// Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
127+
void mlir::registerGpuSerializeToCubinPass() {
128+
PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
129+
"gpu-to-cubin", "Lower GPU kernel function to CUBIN binary annotations",
130+
[] {
131+
// Initialize LLVM NVPTX backend.
132+
LLVMInitializeNVPTXTarget();
133+
LLVMInitializeNVPTXTargetInfo();
134+
LLVMInitializeNVPTXTargetMC();
135+
LLVMInitializeNVPTXAsmPrinter();
136+
137+
return std::make_unique<SerializeToCubinPass>();
138+
});
139+
}
140+
#else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
141+
void mlir::registerGpuSerializeToCubinPass() {}
142+
#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE

mlir/test/Integration/GPU/CUDA/shuffle.mlir

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
// RUN: mlir-cuda-runner %s \
2-
// RUN: -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
1+
// RUN: mlir-opt %s \
2+
// RUN: -gpu-kernel-outlining \
3+
// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{gpu-binary-annotation=nvvm.cubin})' \
34
// RUN: -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
5+
// RUN: | mlir-cpu-runner \
46
// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
57
// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
68
// RUN: --entry-point-result=void \

0 commit comments

Comments (0)