@@ -1,5 +1,5 @@
 //
-// NOTE: this test requires gpu-sm80
+// NOTE: this test requires gpu-sm80 and cusparselt
 //
 // RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
 // RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
@@ -41,7 +41,8 @@ module {
     %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
     %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
     %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
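For context: the fix in this hunk completes the async token chain. Every `gpu.*` op above consumes the token produced by its predecessor, so the dense tensor handle `%dnmat2` (the SpMM output) now gets destroyed on the chain between `%token17` and the copy back to the host. A minimal sketch of the pattern, using shortened hypothetical token names (`%t0`–`%t4` are not from the test):

```mlir
// Hypothetical sketch of async token chaining; each op waits on the
// token of the previous one before executing on the stream.
%t1 = gpu.destroy_sp_mat async [%t0] %spmat
%t2 = gpu.destroy_dn_tensor async [%t1] %dnmat
%t3 = gpu.destroy_dn_tensor async [%t2] %dnmat2  // the previously missing step
%t4 = gpu.memcpy async [%t3] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
```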
@@ -69,9 +70,9 @@ module {
     %c64 = arith.constant 64 : index

     // Matrices A, B, C (16x32, 32x16, 16x16).
-    %a = memref.alloc() : memref<16x32xf16> // 16x32 but 2:4, row-major
-    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
-    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
+    %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major

     //
     // Setup matrix A.
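As an aside on the `2:4` note above: sm80 structured sparsity requires that every group of four consecutive elements along the reduction dimension contain at most two nonzeros, which is the layout expected for matrix A here. A hypothetical row fragment satisfying the constraint (illustration only, not from the test):

```mlir
// Eight elements = two groups of four; each group has <= 2 nonzeros.
%row = arith.constant dense<[1.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 4.0]> : vector<8xf16>
```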
@@ -181,27 +182,8 @@ module {
       vector.print %pb0 : vector<16xf16>
     }

-    // Maps the provided host buffers into the device address space.
-    // Writes from the host are guaranteed to be visible to device
-    // kernels that are launched afterwards. Writes from the device
-    // are guaranteed to be visible on the host after synchronizing
-    // with the device kernel completion.
-    %cast_a = memref.cast %a : memref<16x32xf16> to memref<*xf16>
-    gpu.host_register %cast_a : memref<*xf16>
-    %cast_b = memref.cast %b : memref<32x16xf16> to memref<*xf16>
-    gpu.host_register %cast_b : memref<*xf16>
-    %cast_c = memref.cast %c : memref<16x16xf16> to memref<*xf16>
-    gpu.host_register %cast_c : memref<*xf16>
-
     // Call the kernel.
-    %t1 = arith.constant 1 : index
-    %t32 = arith.constant 32 : index
     call @sampled_matmul(%a, %b, %c) : (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()
-
-    // Unmaps the host buffers.
-    gpu.host_unregister %cast_a : memref<*xf16>
-    gpu.host_unregister %cast_b : memref<*xf16>
-    gpu.host_unregister %cast_c : memref<*xf16>

     //
     // Verify computed matrix C.
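The deletions in this hunk drop the `gpu.host_register`/`gpu.host_unregister` mapping (and two unused index constants): the earlier hunk shows the test already staging data through explicit device buffers (`%d_a`, `%d_b`, `%d_c`) via `gpu.memcpy`/`gpu.dealloc`, so mapping the host buffers into the device address space is redundant. For reference, the removed register/unregister pairing looks like this (assembled from the deleted lines above):

```mlir
// Removed pattern: map a host buffer so device code can access it
// directly, then unmap it after use.
%cast_c = memref.cast %c : memref<16x16xf16> to memref<*xf16>
gpu.host_register %cast_c : memref<*xf16>
// ... device work that reads/writes %cast_c ...
gpu.host_unregister %cast_c : memref<*xf16>
```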
@@ -227,7 +209,7 @@ module {
       %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
       vector.print %pc0 : vector<16xf16>
     }
-
+
     llvm.call @mgpuDestroySparseLtEnv() : () -> ()
     return
   }