Skip to content

[MLIR][NVGPU] Move max threads/blocks size to dialect (NFC) #124454

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,20 @@

#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"

// Maximum warp size
constexpr int kWarpSize = 32;

// Maximum number of threads in a block and block in a grid
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
constexpr int kMaxTotalBlockdim = 1024;
constexpr int kMaxBlockdimx = 1024;
constexpr int kMaxBlockdimy = 1024;
constexpr int kMaxBlockdimz = 64;
constexpr int kMaxTotalGriddim = 2147483647;
constexpr int kMaxGriddimx = 2147483647;
constexpr int kMaxGriddimy = 65535;
constexpr int kMaxGriddimz = 65535;

/// M size of wgmma.mma_async instruction
constexpr int kWgmmaSizeM = 64;

Expand Down
25 changes: 9 additions & 16 deletions mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
Expand Down Expand Up @@ -237,25 +238,17 @@ DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
std::optional<int64_t> blockDimZ) {

// TODO: pass a configuration object to set the limits properly.
static constexpr int maxTotalBlockdim = 1024;
static constexpr int maxBlockdimx = 1024;
static constexpr int maxBlockdimy = 1024;
static constexpr int maxBlockdimz = 64;
static constexpr int maxTotalGriddim = 2147483647;
static constexpr int maxGriddimx = 2147483647;
static constexpr int maxGriddimy = 65535;
static constexpr int maxGriddimz = 65535;

if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
maxTotalBlockdim ||
kMaxTotalBlockdim ||
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
maxTotalGriddim ||
blockDimX.value_or(1) > maxBlockdimx ||
blockDimY.value_or(1) > maxBlockdimy ||
blockDimZ.value_or(1) > maxBlockdimz ||
gridDimY.value_or(1) > maxGriddimy ||
gridDimZ.value_or(1) > maxGriddimz ||
gridDimX.value_or(1) > maxGriddimx) {
kMaxTotalGriddim ||
blockDimX.value_or(1) > kMaxBlockdimx ||
blockDimY.value_or(1) > kMaxBlockdimy ||
blockDimZ.value_or(1) > kMaxBlockdimz ||
gridDimY.value_or(1) > kMaxGriddimy ||
gridDimZ.value_or(1) > kMaxGriddimz ||
gridDimX.value_or(1) > kMaxGriddimx) {
return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with grid_dims = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
Expand Down
Loading