[mlir][gpu] Generate multiple rank-specializations for tensor map creation #74082
Conversation
@llvm/pr-subscribers-mlir-execution-engine @llvm/pr-subscribers-mlir

Author: Adam Paszke (apaszke)

Changes

The previous code was technically incorrect: the type indicated that the memref has only one dimension, while the code below happily dereferenced the size array out of bounds. If the compiler doesn't get too smart about optimizations, this code might even work; but if the compiler realizes that the array has only one element, it might start doing silly things. This change generates a specialization for each supported rank, making sure we don't trigger any UB.

Full diff: https://github.com/llvm/llvm-project/pull/74082.diff

1 Files Affected: mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
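For context on why the old signature was a problem: the ExecutionEngine memref descriptors keep their sizes in a fixed-length array whose length is the template rank, so indexing `sizes[r]` with `r >= rank` through a `StridedMemRefType<char, 1> *` is out of bounds for the declared type, even when the bytes of a higher-rank descriptor happen to sit right behind it. Below is a minimal sketch of that situation; it uses a simplified copy of the descriptor layout (the real definition lives in MLIR's CRunnerUtils.h), and `readShapeThroughRank1View` is a hypothetical function added only for illustration.

```c++
#include <cstdint>
#include <cstdio>

// Simplified sketch of MLIR's StridedMemRefType (see CRunnerUtils.h): the
// sizes/strides arrays have exactly N elements, so any access past index
// N - 1 is undefined behavior regardless of what lies in memory afterwards.
template <typename T, int N>
struct StridedMemRefType {
  T *basePtr;
  T *data;
  int64_t offset;
  int64_t sizes[N];
  int64_t strides[N];
};

// Hypothetical illustration of the old bug: a rank-3 descriptor viewed
// through a rank-1 type. The bytes of sizes[1] and sizes[2] are physically
// there (which is why the old code "might even work"), but the declared
// array has a single element, so the compiler is free to assume r == 0
// and do silly things with the loop below.
void readShapeThroughRank1View(StridedMemRefType<char, 3> &realDesc) {
  auto *view = reinterpret_cast<StridedMemRefType<char, 1> *>(&realDesc);
  for (int64_t r = 0; r < 3; ++r)
    std::printf("size[%lld] = %lld\n", static_cast<long long>(r),
                static_cast<long long>(view->sizes[r])); // UB for r > 0
}
```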
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b8ac9ab90a9f3b8..370f8eabe7f2367 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -423,9 +423,24 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
       elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
 }
+namespace {
+
+template<int rank>
+void mgpuGetMemRefDataAndShape(void *raw_descriptor, char **addr, uint64_t *globalDim) {
+  auto descriptor = reinterpret_cast<StridedMemRefType<char, rank>*>(raw_descriptor);
+  *addr = descriptor->data;
+  if constexpr (rank > 0) { // rank 0 memrefs have no sizes
+    for (int i = 0; i < rank; ++i) {
+      globalDim[i] = static_cast<uint64_t>(descriptor->sizes[rank - i - 1]);
+    }
+  }
+}
+
+} // namespace
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
     int64_t tensorRank, // Dimensionality of tensor
-    StridedMemRefType<char, 1> *descriptor, // Starting address
+    void *ranked_descriptor, // Starting address
     const CUtensorMapDataType tensorDataType, // Stride size (in bytes)
     CUtensorMapInterleave interleave, // Type of interleaved layout
     CUtensorMapSwizzle swizzle, // Bank swizzling pattern
@@ -435,17 +450,40 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
 ) {
   CUtensorMap tensorMap;
-  auto *globalAddress = descriptor->data;
   uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
   uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
   uint32_t tensorRank32 = uint32_t(tensorRank);
+  char *globalAddress = nullptr;
+  switch (tensorRank) {
+  case 0:
+    mgpuGetMemRefDataAndShape<0>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 1:
+    mgpuGetMemRefDataAndShape<1>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 2:
+    mgpuGetMemRefDataAndShape<2>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 3:
+    mgpuGetMemRefDataAndShape<3>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 4:
+    mgpuGetMemRefDataAndShape<4>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 5:
+    mgpuGetMemRefDataAndShape<5>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  default:
+    fprintf(stderr, "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
+    return NULL;
+  }
+
   static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
                                            4, 8, 2, 4, 4, 4};
   for (int64_t r = 0; r < tensorRank; ++r) {
     elementStrides[r] = uint32_t(1);
     boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
-    globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
   }
   globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
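For readers who want to experiment with the dispatch pattern outside the CUDA runtime, here is a self-contained sketch of the same idea: one template instantiation per static rank, selected by a switch over the runtime rank, so each instantiation only ever indexes `sizes` within the bounds its type declares. The names (`getDataAndShape`, `getDataAndShapeDyn`, the rank cap of 3 in the switch, the rank-2 example in `main`) are illustrative only; they mirror, rather than reproduce, the patch. As in the patch, the shape is written out with the innermost dimension first.

```c++
#include <cstdint>
#include <cstdio>

// Simplified descriptor layout, as in the earlier sketch.
template <typename T, int N>
struct StridedMemRefType {
  T *basePtr;
  T *data;
  int64_t offset;
  int64_t sizes[N];
  int64_t strides[N];
};
template <typename T>
struct StridedMemRefType<T, 0> { // rank-0 memrefs carry no sizes/strides
  T *basePtr;
  T *data;
  int64_t offset;
};

// One instantiation per rank: each only touches sizes[0 .. rank-1].
template <int rank>
void getDataAndShape(void *rawDescriptor, char **addr, uint64_t *shape) {
  auto *desc = reinterpret_cast<StridedMemRefType<char, rank> *>(rawDescriptor);
  *addr = desc->data;
  if constexpr (rank > 0)
    for (int i = 0; i < rank; ++i)
      shape[i] = static_cast<uint64_t>(desc->sizes[rank - i - 1]);
}

// The runtime rank selects the right instantiation, mirroring the patch's
// switch statement; unsupported ranks are rejected instead of read past.
bool getDataAndShapeDyn(int64_t rank, void *rawDescriptor, char **addr,
                        uint64_t *shape) {
  switch (rank) {
  case 0: getDataAndShape<0>(rawDescriptor, addr, shape); return true;
  case 1: getDataAndShape<1>(rawDescriptor, addr, shape); return true;
  case 2: getDataAndShape<2>(rawDescriptor, addr, shape); return true;
  case 3: getDataAndShape<3>(rawDescriptor, addr, shape); return true;
  default:
    std::fprintf(stderr, "unsupported rank %lld\n", static_cast<long long>(rank));
    return false;
  }
}

int main() {
  char buffer[6]; // backing storage for a 2x3 memref of char
  StridedMemRefType<char, 2> desc{buffer, buffer, 0, {2, 3}, {3, 1}};
  char *data = nullptr;
  uint64_t shape[5] = {1, 1, 1, 1, 1};
  if (getDataAndShapeDyn(2, &desc, &data, shape))
    std::printf("shape = [%llu, %llu] (innermost dimension first)\n",
                static_cast<unsigned long long>(shape[0]),
                static_cast<unsigned long long>(shape[1]));
}
```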
apaszke force-pushed the branch from 3b5e821 to 35b2bcd (Compare)
✅ With the latest revision this PR passed the C/C++ code formatter.
apaszke force-pushed the branch from 35b2bcd to ae79cc8 (Compare)
Very good catch. Thanks for fixing this.
@grypp please be careful about title/description: you merged this one with the weird wrapping that GitHub does by default. This needs to be addressed before merging.
I didn't see the overflow in the title.