[MLIR][NVGPU] Add tma.fence.descriptor OP #133218
Conversation
When the TMA descriptor is transferred from host memory to global memory using cudaMemcpy, each thread block must insert a fence before any thread accesses the updated tensor map in global memory. Once the tensor map has been accessed, no additional fences are needed by that block unless the map is modified again. This op basically implements `ptx::fence_proxy_tensormap_generic`. The example below is from the CUDA programming guide.

```
#include <cuda.h>
#include <cuda/ptx>
namespace ptx = cuda::ptx;

__device__ CUtensorMap global_tensor_map;

__global__ void kernel(CUtensorMap *tensor_map) {
  // Fence acquire tensor map:
  ptx::n32_t<128> size_bytes;
  // Since the tensor map was modified from the host using cudaMemcpy,
  // the scope should be .sys.
  ptx::fence_proxy_tensormap_generic(
      ptx::sem_acquire, ptx::scope_sys, tensor_map, size_bytes);
  // Safe to use tensor_map after the fence inside this thread.
}

int main() {
  CUtensorMap local_tensor_map;
  // [ ..Initialize map.. ]
  cudaMemcpy(&global_tensor_map, &local_tensor_map, sizeof(CUtensorMap),
             cudaMemcpyHostToDevice);
  kernel<<<1, 1>>>(&global_tensor_map);
}
```
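For reference, here is a minimal sketch of the same fence expressed with the new op at the NVGPU level; the `!tensorMap` alias and function name are illustrative, while the op syntax follows the test added in this PR:

```
!tensorMap = !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>,
    swizzle = none, l2promo = none, oob = nan, interleave = none>

func.func @fence_then_use(%desc: !tensorMap) {
  // Fence the descriptor before its first use in the kernel, since it may
  // have been updated from the host with cudaMemcpy.
  nvgpu.tma.fence.descriptor %desc : !tensorMap
  // ... subsequent TMA operations can now safely read the tensor map ...
  func.return
}
```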
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu
Author: Guray Ozen (grypp)
Full diff: https://github.com/llvm/llvm-project/pull/133218.diff
3 Files Affected:
```
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index eb0fb90d271ed..d6f332f1100b3 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -433,6 +433,20 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
}
+def NVGPU_TmaFenceOp : NVGPU_Op<"tma.fence.descriptor", []> {
+ let summary = "Insert fence given `nvgpu.tensormap.descriptor` ";
+ let description = [{
+ The Op fences the given `$tmaDescriptor`. This is necessary if the tensor map
+ descriptor was modified from the host using cudaMemcpy. In this case, the
+ kernel needs a fence, then it is safe to use `tensor.map` for load.
+ }];
+ let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor);
+ let assemblyFormat = [{
+ $tensorMapDescriptor attr-dict `:` type($tensorMapDescriptor)
+ }];
+}
+
+
def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
let description = [{
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index f53de416f2abd..37d111de25d6b 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -1653,6 +1653,28 @@ struct NVGPUWarpgroupMmaInitAccumulatorOpLowering
}
};
+struct NVGPUTmaFenceOpLowering
+ : public ConvertOpToLLVMPattern<nvgpu::TmaFenceOp> {
+ using ConvertOpToLLVMPattern<nvgpu::TmaFenceOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::TmaFenceOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ MLIRContext *ctx = op.getContext();
+ ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+ auto i32Ty = b.getI32Type();
+ Value tensormapSize =
+ b.create<LLVM::ConstantOp>(i32Ty, rewriter.getI32IntegerAttr(128));
+
+ auto memscope =
+ NVVM::MemScopeKindAttr::get(ctx, ::mlir::NVVM::MemScopeKind::SYS);
+
+ rewriter.replaceOpWithNewOp<NVVM::FenceProxyAcquireOp>(
+ op, memscope, adaptor.getTensorMapDescriptor(), tensormapSize);
+
+ return success();
+ }
+};
+
struct NVGPUTmaPrefetchOpLowering
: public ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp> {
using ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp>::ConvertOpToLLVMPattern;
@@ -1714,6 +1736,7 @@ void mlir::populateNVGPUToNVVMConversionPatterns(
NVGPUTmaAsyncStoreOpLowering, // nvgpu.tma.async.store
NVGPUTmaCreateDescriptorOpLowering, // nvgpu.tma.create.descriptor
NVGPUTmaPrefetchOpLowering, // nvgpu.tma.prefetch.descriptor
+ NVGPUTmaFenceOpLowering, // nvgpu.tma.fence.descriptor
NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
NVGPUGenerateWarpgroupDescriptorLowering, // nvgpu.warpgroup.generate.descriptor
NVGPUWarpgroupMmaOpLowering, // nvgpu.warpgroup.mma
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 6b59b5e4343b4..605ed976ecd4d 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -802,6 +802,17 @@ func.func @tma_prefetch(%tensorMap1d: !tensorMap1d, %p : i1) {
func.return
}
+
+// CHECK-LABEL: @tma_prefetch(
+// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none>, %[[arg1:[a-zA-Z0-9_]+]]: i1
+func.func @tma_fence(%tensorMap1d: !tensorMap1d) {
+ // CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none> to !llvm.ptr
+ // CHECK: %[[S1:.+]] = llvm.mlir.constant(128 : i32) : i32
+ // CHECK: nvvm.fence.proxy.acquire <sys> %[[S0]], %[[S1]]
+ nvgpu.tma.fence.descriptor %tensorMap1d: !tensorMap1d
+ func.return
+}
+
!lhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
!rhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<64x64xf16, strided<[64, 1], offset: 8192>, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
```
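For context, a sketch of the lowering chain this pattern implements: the NVGPU op becomes an `nvvm.fence.proxy.acquire` at system scope with the 128-byte tensor-map size, which in turn corresponds to the PTX fence used by the CUDA example above (the `%desc` and `%ptr` names below are illustrative):

```
// nvgpu.tma.fence.descriptor %desc : !tensorMap
// is rewritten by --convert-nvgpu-to-nvvm into:
%size = llvm.mlir.constant(128 : i32) : i32
nvvm.fence.proxy.acquire <sys> %ptr, %size
// which maps to the PTX instruction
//   fence.proxy.tensormap::generic.acquire.sys [%ptr], 128;
```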
The fix looks good to me.
Thanks for the change Guray! :) LGTM!