-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[MLIR][NVGPU] Move max threads/blocks size to dialect (NFC) #124454
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp) Changes: This PR moves the maximum number of threads in a block and blocks in a grid to the nvgpu dialect to avoid replicated code. The limits are defined here: Full diff: https://github.com/llvm/llvm-project/pull/124454.diff 2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index aad2ac6f4dd2b4..db4c63b3390eb7 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -22,8 +22,20 @@
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
+// Maximum warp size
constexpr int kWarpSize = 32;
+// Maximum number of threads in a block and block in a grid
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+constexpr int kMaxTotalBlockdim = 1024;
+constexpr int kMaxBlockdimx = 1024;
+constexpr int kMaxBlockdimy = 1024;
+constexpr int kMaxBlockdimz = 64;
+constexpr int kMaxTotalGriddim = 2147483647;
+constexpr int kMaxGriddimx = 2147483647;
+constexpr int kMaxGriddimy = 65535;
+constexpr int kMaxGriddimz = 65535;
+
/// M size of wgmma.mma_async instruction
constexpr int kWgmmaSizeM = 64;
diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
index f4d36129bae776..6fbde3a77087c8 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
@@ -113,17 +114,16 @@ static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
// clang-format on
// Return n-D ids for indexing and 1-D size + id for predicate generation.
- return IdBuilderResult{
- /*mappingIdOps=*/ids,
- /*availableMappingSizes=*/
- SmallVector<int64_t>{computeProduct(originalBasis)},
- // `forallMappingSizes` iterate in the scaled basis, they need to be
- // scaled back into the original basis to provide tight
- // activeMappingSizes quantities for predication.
- /*activeMappingSizes=*/
- SmallVector<int64_t>{computeProduct(forallMappingSizes) *
- multiplicity},
- /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
+ return IdBuilderResult{
+ /*mappingIdOps=*/ids,
+ /*availableMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(originalBasis)},
+ // `forallMappingSizes` iterate in the scaled basis, they need to be
+ // scaled back into the original basis to provide tight
+ // activeMappingSizes quantities for predication.
+ /*activeMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(forallMappingSizes) * multiplicity},
+ /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
};
return res;
@@ -237,25 +237,17 @@ DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
std::optional<int64_t> blockDimZ) {
// TODO: pass a configuration object to set the limits properly.
- static constexpr int maxTotalBlockdim = 1024;
- static constexpr int maxBlockdimx = 1024;
- static constexpr int maxBlockdimy = 1024;
- static constexpr int maxBlockdimz = 64;
- static constexpr int maxTotalGriddim = 2147483647;
- static constexpr int maxGriddimx = 2147483647;
- static constexpr int maxGriddimy = 65535;
- static constexpr int maxGriddimz = 65535;
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
- maxTotalBlockdim ||
+ kMaxTotalBlockdim ||
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
- maxTotalGriddim ||
- blockDimX.value_or(1) > maxBlockdimx ||
- blockDimY.value_or(1) > maxBlockdimy ||
- blockDimZ.value_or(1) > maxBlockdimz ||
- gridDimY.value_or(1) > maxGriddimy ||
- gridDimZ.value_or(1) > maxGriddimz ||
- gridDimX.value_or(1) > maxGriddimx) {
+ kMaxTotalGriddim ||
+ blockDimX.value_or(1) > kMaxBlockdimx ||
+ blockDimY.value_or(1) > kMaxBlockdimy ||
+ blockDimZ.value_or(1) > kMaxBlockdimz ||
+ gridDimY.value_or(1) > kMaxGriddimy ||
+ gridDimZ.value_or(1) > kMaxGriddimz ||
+ gridDimX.value_or(1) > kMaxGriddimx) {
return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with grid_dims = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
|
@llvm/pr-subscribers-mlir-nvgpu Author: Guray Ozen (grypp) Changes: This PR moves the maximum number of threads in a block and blocks in a grid to the nvgpu dialect to avoid replicated code. The limits are defined here: Full diff: https://github.com/llvm/llvm-project/pull/124454.diff 2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index aad2ac6f4dd2b4..db4c63b3390eb7 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -22,8 +22,20 @@
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
+// Maximum warp size
constexpr int kWarpSize = 32;
+// Maximum number of threads in a block and block in a grid
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+constexpr int kMaxTotalBlockdim = 1024;
+constexpr int kMaxBlockdimx = 1024;
+constexpr int kMaxBlockdimy = 1024;
+constexpr int kMaxBlockdimz = 64;
+constexpr int kMaxTotalGriddim = 2147483647;
+constexpr int kMaxGriddimx = 2147483647;
+constexpr int kMaxGriddimy = 65535;
+constexpr int kMaxGriddimz = 65535;
+
/// M size of wgmma.mma_async instruction
constexpr int kWgmmaSizeM = 64;
diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
index f4d36129bae776..6fbde3a77087c8 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
@@ -113,17 +114,16 @@ static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
// clang-format on
// Return n-D ids for indexing and 1-D size + id for predicate generation.
- return IdBuilderResult{
- /*mappingIdOps=*/ids,
- /*availableMappingSizes=*/
- SmallVector<int64_t>{computeProduct(originalBasis)},
- // `forallMappingSizes` iterate in the scaled basis, they need to be
- // scaled back into the original basis to provide tight
- // activeMappingSizes quantities for predication.
- /*activeMappingSizes=*/
- SmallVector<int64_t>{computeProduct(forallMappingSizes) *
- multiplicity},
- /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
+ return IdBuilderResult{
+ /*mappingIdOps=*/ids,
+ /*availableMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(originalBasis)},
+ // `forallMappingSizes` iterate in the scaled basis, they need to be
+ // scaled back into the original basis to provide tight
+ // activeMappingSizes quantities for predication.
+ /*activeMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(forallMappingSizes) * multiplicity},
+ /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
};
return res;
@@ -237,25 +237,17 @@ DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
std::optional<int64_t> blockDimZ) {
// TODO: pass a configuration object to set the limits properly.
- static constexpr int maxTotalBlockdim = 1024;
- static constexpr int maxBlockdimx = 1024;
- static constexpr int maxBlockdimy = 1024;
- static constexpr int maxBlockdimz = 64;
- static constexpr int maxTotalGriddim = 2147483647;
- static constexpr int maxGriddimx = 2147483647;
- static constexpr int maxGriddimy = 65535;
- static constexpr int maxGriddimz = 65535;
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
- maxTotalBlockdim ||
+ kMaxTotalBlockdim ||
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
- maxTotalGriddim ||
- blockDimX.value_or(1) > maxBlockdimx ||
- blockDimY.value_or(1) > maxBlockdimy ||
- blockDimZ.value_or(1) > maxBlockdimz ||
- gridDimY.value_or(1) > maxGriddimy ||
- gridDimZ.value_or(1) > maxGriddimz ||
- gridDimX.value_or(1) > maxGriddimx) {
+ kMaxTotalGriddim ||
+ blockDimX.value_or(1) > kMaxBlockdimx ||
+ blockDimY.value_or(1) > kMaxBlockdimy ||
+ blockDimZ.value_or(1) > kMaxBlockdimz ||
+ gridDimY.value_or(1) > kMaxGriddimy ||
+ gridDimZ.value_or(1) > kMaxGriddimz ||
+ gridDimX.value_or(1) > kMaxGriddimx) {
return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with grid_dims = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
|
This PR moves the maximum number of threads in a block and blocks in a grid to the nvgpu dialect to avoid replicated code. The limits are defined here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
3de693d
to
6340c36
Compare
) This PR moves the maximum number of threads in a block and blocks in a grid to the nvgpu dialect to avoid replicated code. The limits are defined here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
This PR moves the maximum number of threads in a block and blocks in a grid to the nvgpu dialect to avoid replicated code.
The limits are defined here:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability