
Commit 38e09a5

grypplegrosbuffle authored and committed
[mlir] adapt sm_90 integration test mbarrier.group (llvm#67423)
llvm#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.
1 parent 8a4e5f1 commit 38e09a5
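
For context, the mbarrier surface reworked by llvm#65951 as it is used in these tests: the barrier type is now !nvgpu.mbarrier.group<...> and each mbarrier/TMA op addresses one barrier within that group through an index such as %9[%c0]. A minimal sketch of the pattern, assembled from the updated test lines below (value names like %bar, %num_threads, %tx_bytes, %phase, and %ticks are illustrative placeholders, not names from the tests):

!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>

// Create a barrier group in workgroup (shared) memory.
%bar = nvgpu.mbarrier.create -> !barrierType
// Every op now takes an index (here [%c0]) selecting a barrier within the group.
nvgpu.mbarrier.init %bar[%c0], %num_threads : !barrierType
nvgpu.mbarrier.arrive.expect_tx %bar[%c0], %tx_bytes : !barrierType
nvgpu.mbarrier.try_wait.parity %bar[%c0], %phase, %ticks : !barrierType
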


3 files changed: +29, -33 lines changed


mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir

Lines changed: 6 additions & 6 deletions
@@ -35,7 +35,7 @@
 // |-------------------------------|


-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token

 !lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {

 // Step 6. Initialize the mbarrier
 %9 = nvgpu.mbarrier.create -> !barrierType
-nvgpu.mbarrier.init %9, %5 : !barrierType
+nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
 %10 = arith.cmpi eq, %6, %c0 : index


 // Step 7. First thread does TMA load
 scf.if %10 {
 gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
-nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
-nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
 } else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
 }

 // Step 8. Wait until TMA is done
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

 // Step 9. Print loaded data in 128b swizzled
 scf.if %10 {

mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir

Lines changed: 16 additions & 21 deletions
@@ -40,7 +40,7 @@
 // |-------------------------------|


-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token

 !lhs = memref<128x64xf16>
@@ -96,56 +96,50 @@ module @mymod {
 memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
 }
 }
-
-// Step 2. Print on the host
-%lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
-call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
-%rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
-call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()

-// Step 3. Copy host to device
+// Step 2. Copy host to device
 %0 = gpu.wait async
 %d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
 %d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
 %1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
 %2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs

-// Step 4. Create TMA tensor descriptor
+// Step 3. Create TMA tensor descriptor
 %d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
 %d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>

 %d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
 %d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap

-// Step 5. Launch a GPU kernel
+// Step 4. Launch a GPU kernel
 gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
 %5 = gpu.block_dim x
 %6 = gpu.thread_id x
 %lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs
 %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
 %rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>

-// Step 6. Initialize the mbarrier
+// Step 5. Initialize the mbarrier
 %9 = nvgpu.mbarrier.create -> !barrierType
-nvgpu.mbarrier.init %9, %5 : !barrierType
+nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
 %10 = arith.cmpi eq, %6, %c0 : index


-// Step 7. First thread does TMA load
+// Step 6. First thread does TMA load
 scf.if %10 {
 gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
-nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
-nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
-nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
-nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
 } else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
 }

-// Step 8. Wait until TMA is done
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+// Step 7. Wait until TMA is done
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

-// Step 9. Print loaded data in 128b swizzled
+// Step 8. Print loaded data in 128b swizzled
 scf.if %10 {
 gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
 scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
 }
 gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
 }
+gpu.barrier
 gpu.terminator
 }
 return

mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir

Lines changed: 7 additions & 6 deletions
@@ -39,6 +39,7 @@
 // RUN: --entry-point-result=void \
 // RUN: | FileCheck %s

+
 // CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
 // CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
 // CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
 %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
 %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
 %9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
-nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
 gpu.barrier
 %10 = arith.cmpi eq, %6, %c0 : index
 scf.if %10 {
-nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
 %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
 %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
 gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
 gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
-nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
-nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
 } else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
 }
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
 scf.if %10 {
 %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
 %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
