Commit f6f817d

[mlir][sparse][gpu] minor improvements in 2:4 example
Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D155244
Parent: 74e928a

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir

Lines changed: 7 additions & 25 deletions
@@ -1,5 +1,5 @@
 //
-// NOTE: this test requires gpu-sm80
+// NOTE: this test requires gpu-sm80 and cusparselt
 //
 // RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
 // RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
@@ -41,7 +41,8 @@ module {
     %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
     %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
     %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
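
The hunk above fixes an omission: %dnmat2 was created but never destroyed, so the change threads a gpu.destroy_dn_tensor for it into the async token chain before the copy back to the host. A minimal sketch of that chaining pattern, using only gpu dialect ops that appear in this test (the names and the pre-existing host buffer %h_buf are illustrative, not from the commit):

    %t0 = gpu.wait async
    %d_buf, %t1 = gpu.alloc async [%t0] () : memref<16x16xf16>
    // Each async op waits on the tokens in brackets and yields a new token,
    // so the copy below cannot start before the allocation has finished.
    %t2 = gpu.memcpy async [%t1] %d_buf, %h_buf : memref<16x16xf16>, memref<16x16xf16>
    %t3 = gpu.dealloc async [%t2] %d_buf : memref<16x16xf16>
    gpu.wait [%t3]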
@@ -69,9 +70,9 @@ module {
     %c64 = arith.constant 64 : index
 
     // Matrices A, B, C (16x32, 32x16, 16x16).
-    %a = memref.alloc() : memref<16x32xf16> // 16x32 but 2:4, row-major
-    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
-    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
+    %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
 
     //
     // Setup matrix A.
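
For context, the "2:4" in the comment on %a refers to the structured sparsity pattern accepted by sm80 sparse tensor cores: in every aligned group of four consecutive elements along a row, at most two are nonzero. A hypothetical row satisfying that constraint (illustrative values, not the test's actual data):

    // Two nonzeros in each aligned group of four: [1,0,2,0] and [0,3,0,4].
    %pattern = arith.constant dense<[1.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 4.0]> : vector<8xf16>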
@@ -181,27 +182,8 @@ module {
       vector.print %pb0 : vector<16xf16>
     }
 
-    // Maps the provided host buffers into the device address space.
-    // Writes from the host are guaranteed to be visible to device
-    // kernels that are launched afterwards. Writes from the device
-    // are guaranteed to be visible on the host after synchronizing
-    // with the device kernel completion.
-    %cast_a = memref.cast %a : memref<16x32xf16> to memref<*xf16>
-    gpu.host_register %cast_a : memref<*xf16>
-    %cast_b = memref.cast %b : memref<32x16xf16> to memref<*xf16>
-    gpu.host_register %cast_b : memref<*xf16>
-    %cast_c = memref.cast %c : memref<16x16xf16> to memref<*xf16>
-    gpu.host_register %cast_c : memref<*xf16>
-
     // Call the kernel.
-    %t1 = arith.constant 1 : index
-    %t32 = arith.constant 32 : index
     call @sampled_matmul (%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()
-
-    // Unmaps the host buffers.
-    gpu.host_unregister %cast_a : memref<*xf16>
-    gpu.host_unregister %cast_b : memref<*xf16>
-    gpu.host_unregister %cast_c : memref<*xf16>
 
     //
     // Verify computed matrix C.
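
The deleted gpu.host_register block is dead weight here because @sampled_matmul stages its operands explicitly through gpu.alloc and gpu.memcpy rather than dereferencing host memory from the device; the unused %t1/%t32 constants go with it. Registration is only needed when a launched kernel touches the host allocation in place, roughly like this hedged sketch (not code from this test):

    %c1 = arith.constant 1 : index
    %cast = memref.cast %c : memref<16x16xf16> to memref<*xf16>
    gpu.host_register %cast : memref<*xf16>  // make %c visible to the device
    gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
               threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
      // The kernel reads the registered host buffer directly.
      %v = memref.load %c[%tx, %ty] : memref<16x16xf16>
      gpu.terminator
    }
    gpu.host_unregister %cast : memref<*xf16>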
@@ -227,7 +209,7 @@ module {
       %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
       vector.print %pc0 : vector<16xf16>
     }
-
+
     llvm.call @mgpuDestroySparseLtEnv() : () -> ()
     return
   }
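
The llvm.call @mgpuDestroySparseLtEnv() retained in the last hunk pairs with a matching create call earlier in the test (outside this diff), which sets up the cuSPARSELt handle that gpu.spmm relies on. The overall shape, sketched with signatures assumed from the call visible here:

    llvm.func @mgpuCreateSparseLtEnv()
    llvm.func @mgpuDestroySparseLtEnv()

    func.func @main() {
      llvm.call @mgpuCreateSparseLtEnv() : () -> ()
      // ... build the 2:4 sparse matrix and run gpu.spmm ...
      llvm.call @mgpuDestroySparseLtEnv() : () -> ()
      return
    }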
