Skip to content

Commit ca74ad8

Browse files
grypp authored and nicolasvasilache committed
[mlir] Nvidia Hopper TMA load integration test
This work introduces sm90 integration testing and adds a single test. Depends on: D155825, D155680, D155563, D155453. Reviewed By: nicolasvasilache. Differential Revision: https://reviews.llvm.org/D155838
1 parent 67754a9 commit ca74ad8

File tree

5 files changed

+99
-3
lines changed

5 files changed

+99
-3
lines changed

mlir/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS)
3131
option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.")
3232
option(MLIR_RUN_CUDA_SM80_TESTS "Run CUDA A100 tests.")
3333
option(MLIR_RUN_CUDA_SM80_LT_TESTS "Run CUDA A100 structured sparsity tests.")
34+
option(MLIR_RUN_CUDA_SM90_TESTS "Run CUDA H100 tests.")
3435
option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.")
3536
option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.")
3637

@@ -71,6 +72,7 @@ llvm_canonicalize_cmake_booleans(
7172
MLIR_RUN_ARM_SME_TESTS
7273
MLIR_RUN_CUDA_SM80_TESTS
7374
MLIR_RUN_CUDA_SM80_LT_TESTS
75+
MLIR_RUN_CUDA_SM90_TESTS
7476
)
7577

7678
configure_lit_site_cfg(
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
# This directory's tests require both the CUDA runner and the
# MLIR_RUN_CUDA_SM90_TESTS (H100) flag to be enabled at configure time.
if not (config.enable_cuda_runner and config.mlir_run_cuda_sm90_tests):
    config.unsupported = True
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
// Integration test for NVIDIA Hopper (sm_90) TMA loads: lowers through the
// NVGPU/NVVM pipeline, compiles for sm_90 with dump-ptx, and FileChecks the
// emitted PTX for the expected mbarrier / cp.async.bulk.tensor instructions.
// NOTE: `dump-ptx` prints to stderr, so it must be redirected with `2>&1`
// (the original `2&>1` was a typo that left FileCheck reading nothing).
// RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
// RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -expand-strided-metadata \
// RUN: -lower-affine \
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -canonicalize \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX

// CHECK-PTX: mbarrier.init.shared.b64
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: mbarrier.try_wait.parity.shared.b64

module @mymod {
  // Workgroup (address space 3) destinations for the two TMA loads.
  memref.global "private" @bufferLhsGlobal : memref<64x8xf32, 3>
  memref.global "private" @bufferRhsGlobal : memref<8x128xf32, 3>
  func.func @main() {
    %c10000000 = arith.constant 10000000 : index
    // 6144 = 64*8*4 + 8*128*4: total bytes the two TMA loads will deliver,
    // used as the mbarrier's expected-transaction count.
    %c6144 = arith.constant 6144 : index
    %c45 = arith.constant 45 : index
    %c7 = arith.constant 7 : index
    %c64 = arith.constant 64 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %c128 = arith.constant 128 : index
    %cst = arith.constant 3.000000e+00 : f32

    // Host-side initialization: rhs is filled with 3.0, lhs with the column
    // index cast to f32, so loaded values are recognizable after the TMA copy.
    %alloc = memref.alloc() : memref<64x8xf32>
    %alloc_0 = memref.alloc() : memref<8x128xf32>
    scf.for %arg0 = %c0 to %c8 step %c1 {
      scf.for %arg1 = %c0 to %c128 step %c1 {
        memref.store %cst, %alloc_0[%arg0, %arg1] : memref<8x128xf32>
      }
    }
    scf.for %arg0 = %c0 to %c64 step %c1 {
      scf.for %arg1 = %c0 to %c8 step %c1 {
        %5 = arith.index_cast %arg1 : index to i64
        %6 = arith.uitofp %5 : i64 to f32
        memref.store %6, %alloc[%arg0, %arg1] : memref<64x8xf32>
      }
    }

    // Copy the host buffers to device memory and build one TMA descriptor per
    // buffer, sized to the full 2-D box of each source.
    %0 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
    %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>
    %1 = gpu.memcpy async [%0] %memref, %alloc : memref<64x8xf32>, memref<64x8xf32>
    %2 = gpu.memcpy async [%0] %memref_1, %alloc_0 : memref<8x128xf32>, memref<8x128xf32>
    %cast = memref.cast %memref : memref<64x8xf32> to memref<*xf32>
    %cast_3 = memref.cast %memref_1 : memref<8x128xf32> to memref<*xf32>
    %3 = nvgpu.tma.create.descriptor %cast box[%c64, %c8] : memref<*xf32> -> <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    %4 = nvgpu.tma.create.descriptor %cast_3 box[%c8, %c128] : memref<*xf32> -> <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>

    // Single block of 128 threads: thread 0 issues both TMA loads and arrives
    // on the mbarrier expecting 6144 bytes; all other threads arrive expecting
    // 0 bytes, then everyone waits for the transfers to complete.
    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
      %5 = gpu.block_dim x
      %6 = gpu.thread_id x
      %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
      %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
      // One mbarrier in workgroup memory, initialized for all 128 threads.
      %9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
      nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
      gpu.barrier
      %10 = arith.cmpi eq, %6, %c0 : index
      scf.if %10 {
        nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
        // Print the pre-load contents so the TMA effect is observable.
        %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
        %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
        nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
        nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
      } else {
        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
      }
      // Spin (bounded by 10M iterations) until the mbarrier phase completes.
      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
      scf.if %10 {
        %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
        %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
      }
      gpu.terminator
    }
    return
  }
}

mlir/test/lit.site.cfg.py.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ config.mlir_run_riscv_vector_tests = "@MLIR_RUN_RISCV_VECTOR_TESTS@"
4848
config.mlir_run_cuda_tensor_core_tests = @MLIR_RUN_CUDA_TENSOR_CORE_TESTS@
4949
config.mlir_run_cuda_sm80_tests = @MLIR_RUN_CUDA_SM80_TESTS@
5050
config.mlir_run_cuda_sm80_lt_tests = @MLIR_RUN_CUDA_SM80_LT_TESTS@
51+
config.mlir_run_cuda_sm90_tests = @MLIR_RUN_CUDA_SM90_TESTS@
5152
config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@
5253
config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@"
5354
config.arm_emulator_options = "@ARM_EMULATOR_OPTIONS@"

utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ expand_template(
5151
"@MLIR_RUN_CUDA_TENSOR_CORE_TESTS@": "0",
5252
"@MLIR_RUN_CUDA_SM80_TESTS@": "0",
5353
"@MLIR_RUN_CUDA_SM80_LT_TESTS@": "0",
54+
"@MLIR_RUN_CUDA_SM90_TESTS@": "0",
5455
"@MLIR_INCLUDE_INTEGRATION_TESTS@": "0",
5556
"@SHLIBDIR@": package_path("//llvm:BUILD"),
5657
},
@@ -640,19 +641,19 @@ cc_library(
640641
"//mlir:IndexToLLVM",
641642
"//mlir:MathToLLVM",
642643
"//mlir:MemRefDialect",
643-
"//mlir:MemRefTransforms",
644644
"//mlir:MemRefToLLVM",
645+
"//mlir:MemRefTransforms",
645646
"//mlir:NVGPUToNVVM",
646647
"//mlir:NVVMToLLVMIRTranslation",
647648
"//mlir:Pass",
648-
"//mlir:ReconcileUnrealizedCasts",
649649
"//mlir:ROCDLToLLVMIRTranslation",
650+
"//mlir:ReconcileUnrealizedCasts",
650651
"//mlir:SCFDialect",
651652
"//mlir:SCFToControlFlow",
652653
"//mlir:SPIRVDialect",
653654
"//mlir:ToLLVMIRTranslation",
654-
"//mlir:Transforms",
655655
"//mlir:TransformUtils",
656+
"//mlir:Transforms",
656657
"//mlir:VectorDialect",
657658
"//mlir:VectorToLLVM",
658659
"//mlir:VectorToSCF",

0 commit comments

Comments
 (0)