
Commit 29ef475
[mlir][nvgpu] Add memref address space convert
The GPU dialect has `#gpu.address_space<workgroup>` for NVGPU's shared memory (address space 3). However, when IR combines the NVGPU and GPU dialects, the `nvgpu-to-nvvm` pass fails due to the missing attribute conversion. This PR adds `populateGpuMemorySpaceAttributeConversions` to the nvgpu-to-nvvm lowering, so we can use `#gpu.address_space<workgroup>` with the `nvgpu-to-nvvm` pass.
1 parent f42b761
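
For illustration, this is the situation the commit addresses: IR that spells shared memory with the GPU dialect attribute rather than the raw integer 3. A minimal MLIR sketch (the function is hypothetical, not taken from the patch):

// Hypothetical input, not from the patch: a workgroup-memory memref
// spelled with the GPU dialect attribute instead of the integer 3.
func.func @use_workgroup(%buf: memref<128xf32, #gpu.address_space<workgroup>>) -> f32 {
  %c0 = arith.constant 0 : index
  %v = memref.load %buf[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  return %v : f32
}

Before this change, the `nvgpu-to-nvvm` type converter had no rule for `#gpu.address_space<workgroup>` and conversion failed; with the mapping below registered, this memref converts exactly like `memref<128xf32, 3>`, i.e. onto a descriptor whose pointers live in LLVM address space 3.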

2 files changed, 28 insertions(+), 0 deletions(-)

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Lines changed: 15 additions & 0 deletions
@@ -405,6 +405,21 @@ struct ConvertNVGPUToNVVMPass
     RewritePatternSet patterns(&getContext());
     LLVMTypeConverter converter(&getContext(), options);
     IRRewriter rewriter(&getContext());
+    populateGpuMemorySpaceAttributeConversions(
+        converter, [](gpu::AddressSpace space) -> unsigned {
+          switch (space) {
+          case gpu::AddressSpace::Global:
+            return static_cast<unsigned>(
+                NVVM::NVVMMemorySpace::kGlobalMemorySpace);
+          case gpu::AddressSpace::Workgroup:
+            return static_cast<unsigned>(
+                NVVM::NVVMMemorySpace::kSharedMemorySpace);
+          case gpu::AddressSpace::Private:
+            return 0;
+          }
+          llvm_unreachable("unknown address space enum value");
+          return 0;
+        });
     /// device-side async tokens cannot be materialized in nvvm. We just
     /// convert them to a dummy i32 type in order to easily drop them during
     /// conversion.
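
For readers unfamiliar with the hook: `populateGpuMemorySpaceAttributeConversions` (from the GPUCommon conversion utilities) registers a type-attribute conversion on the type converter, so memref types whose memory space is a `gpu::AddressSpaceAttr` are remapped to the given integer address space before the usual memref-to-LLVM lowering runs. Below is a minimal standalone sketch that exercises the same mapping directly; the helper name and setup are illustrative, not part of this commit:

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// Illustrative helper (not from the commit): convert
// memref<128xf32, #gpu.address_space<workgroup>> and return the result.
static Type convertWorkgroupMemref(MLIRContext &ctx) {
  LLVMTypeConverter converter(&ctx);
  // Same mapping the pass installs: workgroup -> NVVM shared memory (3).
  populateGpuMemorySpaceAttributeConversions(
      converter, [](gpu::AddressSpace space) -> unsigned {
        switch (space) {
        case gpu::AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case gpu::AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case gpu::AddressSpace::Private:
          return 0;
        }
        return 0;
      });
  auto wgSpace = gpu::AddressSpaceAttr::get(&ctx, gpu::AddressSpace::Workgroup);
  auto memrefTy = MemRefType::get({128}, Float32Type::get(&ctx),
                                  MemRefLayoutAttrInterface{}, wgSpace);
  // Yields the standard memref descriptor whose pointers carry LLVM
  // address space 3 (NVVM shared memory).
  return converter.convertType(memrefTy);
}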

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

Lines changed: 13 additions & 0 deletions
@@ -666,6 +666,19 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
   func.return
 }
 
+// CHECK-LABEL: func @async_tma_load
+!tensorMap1dgpuspace = !nvgpu.tensormap.descriptor<tensor = memref<128xf32, #gpu.address_space<workgroup>>, swizzle=none, l2promo = none, oob = nan, interleave = none>
+func.func @async_tma_load_gpu_address_space(%tensorMap1d: !tensorMap1dgpuspace,
+                                            %buffer1d: memref<128xf32, #gpu.address_space<workgroup>>,
+                                            %mbarrier: !mbarrier) {
+  %c0 = arith.constant 0 : index
+  %crd0 = arith.constant 0 : index
+  %crd1 = arith.constant 0 : index
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
+  nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1dgpuspace, !mbarrier -> memref<128xf32,#gpu.address_space<workgroup>>
+  func.return
+}
+
 // CHECK-LABEL: func @async_tma_load_pred
 func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
                                %buffer1d: memref<128xf32,3>,

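As a usage note, tests in this directory are lit tests driven by mlir-opt and FileCheck; a sketch of a typical RUN line for this kind of test (the file's actual RUN line may differ):

// RUN: mlir-opt %s -convert-nvgpu-to-nvvm | FileCheck %s

The new function verifies that a TMA load whose destination memref is spelled with `#gpu.address_space<workgroup>` lowers to the same `nvvm.cp.async.bulk.tensor.shared.cluster.global` op as the numeric-address-space variants elsewhere in the file.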