Skip to content

Commit 38d9a44

Browse files
authored
[MLIR][NVGPU] Add tma.fence.descriptor OP (#133218)
When the TMA descriptor is transferred from host memory to global memory using cudaMemcpy, each thread block must insert a fence before any thread accesses the updated tensor map in global memory. Once the tensor map has been accessed, no additional fences are needed by that block unless the map is modified again. [Example from the CUDA programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#using-tma-to-transfer-multi-dimensional-arrays). The `tma.fence.descriptor` op basically implements `ptx::fence_proxy_tensormap_generic`.

```cpp
#include <cuda.h>
#include <cuda/ptx>
namespace ptx = cuda::ptx;

__device__ CUtensorMap global_tensor_map;

__global__ void kernel(CUtensorMap *tensor_map) {
  // Fence acquire tensor map:
  ptx::n32_t<128> size_bytes;
  // Since the tensor map was modified from the host using cudaMemcpy,
  // the scope should be .sys.
  ptx::fence_proxy_tensormap_generic(ptx::sem_acquire, ptx::scope_sys,
                                     tensor_map, size_bytes);
  // Safe to use tensor_map after the fence inside this thread.
}

int main() {
  CUtensorMap local_tensor_map;
  // [ ..Initialize map.. ]
  cudaMemcpy(&global_tensor_map, &local_tensor_map, sizeof(CUtensorMap),
             cudaMemcpyHostToDevice);
  // The kernel takes a CUtensorMap *, so pass the descriptor's address.
  kernel<<<1, 1>>>(&global_tensor_map);
}
```
1 parent bc7e391 commit 38d9a44

File tree

3 files changed: +48 −0 lines changed

mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,20 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
452452
let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
453453
}
454454

455+
def NVGPU_TmaFenceOp : NVGPU_Op<"tma.fence.descriptor", []> {
  let summary = "Insert fence given `nvgpu.tensormap.descriptor`";
  let description = [{
    The Op fences the given `$tensorMapDescriptor`. This is necessary if the
    tensor map descriptor was modified from the host using cudaMemcpy. In this
    case, the kernel needs a fence after which it is safe to use the tensor
    map descriptor.
  }];
  let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor);
  let assemblyFormat = [{
    $tensorMapDescriptor attr-dict `:` type($tensorMapDescriptor)
  }];
}
467+
468+
455469
def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
456470
let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
457471
let description = [{

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,28 @@ struct NVGPUWarpgroupMmaInitAccumulatorOpLowering
16711671
}
16721672
};
16731673

1674+
struct NVGPUTmaFenceOpLowering
1675+
: public ConvertOpToLLVMPattern<nvgpu::TmaFenceOp> {
1676+
using ConvertOpToLLVMPattern<nvgpu::TmaFenceOp>::ConvertOpToLLVMPattern;
1677+
LogicalResult
1678+
matchAndRewrite(nvgpu::TmaFenceOp op, OpAdaptor adaptor,
1679+
ConversionPatternRewriter &rewriter) const override {
1680+
MLIRContext *ctx = op.getContext();
1681+
ImplicitLocOpBuilder b(op->getLoc(), rewriter);
1682+
auto i32Ty = b.getI32Type();
1683+
Value tensormapSize =
1684+
b.create<LLVM::ConstantOp>(i32Ty, rewriter.getI32IntegerAttr(128));
1685+
1686+
auto memscope =
1687+
NVVM::MemScopeKindAttr::get(ctx, ::mlir::NVVM::MemScopeKind::SYS);
1688+
1689+
rewriter.replaceOpWithNewOp<NVVM::FenceProxyAcquireOp>(
1690+
op, memscope, adaptor.getTensorMapDescriptor(), tensormapSize);
1691+
1692+
return success();
1693+
}
1694+
};
1695+
16741696
struct NVGPUTmaPrefetchOpLowering
16751697
: public ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp> {
16761698
using ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp>::ConvertOpToLLVMPattern;
@@ -1733,6 +1755,7 @@ void mlir::populateNVGPUToNVVMConversionPatterns(
17331755
NVGPUTmaAsyncStoreOpLowering, // nvgpu.tma.async.store
17341756
NVGPUTmaCreateDescriptorOpLowering, // nvgpu.tma.create.descriptor
17351757
NVGPUTmaPrefetchOpLowering, // nvgpu.tma.prefetch.descriptor
1758+
NVGPUTmaFenceOpLowering, // nvgpu.tma.fence.descriptor
17361759
NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
17371760
NVGPUGenerateWarpgroupDescriptorLowering, // nvgpu.warpgroup.generate.descriptor
17381761
NVGPUWarpgroupMmaOpLowering, // nvgpu.warpgroup.mma

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -824,6 +824,17 @@ func.func @tma_prefetch(%tensorMap1d: !tensorMap1d, %p : i1) {
824824
func.return
825825
}
826826

827+
828+
// CHECK-LABEL: @tma_fence(
829+
// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none>
830+
// Verifies that nvgpu.tma.fence.descriptor lowers to a sys-scoped
// nvvm.fence.proxy.acquire over the converted descriptor pointer,
// with the fenced size pinned to 128 bytes.
func.func @tma_fence(%tensorMap1d: !tensorMap1d) {
831+
// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none> to !llvm.ptr
832+
// CHECK: %[[S1:.+]] = llvm.mlir.constant(128 : i32) : i32
833+
// CHECK: nvvm.fence.proxy.acquire <sys> %[[S0]], %[[S1]]
834+
nvgpu.tma.fence.descriptor %tensorMap1d: !tensorMap1d
835+
func.return
836+
}
837+
827838
!lhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
828839
!rhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<64x64xf16, strided<[64, 1], offset: 8192>, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
829840

0 commit comments

Comments
 (0)