// NOTE: this test requires gpu-sm80 and cusparselt
//
- // DEFINE: %{compile} = mlir-opt %s \
- // DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
+ // DEFINE: %{compile} = mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+ // DEFINE:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+ // DEFINE:   %s
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE:   --shared-libs=%mlir_cuda_runtime \
// DEFINE:   --shared-libs=%mlir_c_runner_utils \
// DEFINE:   --e main --entry-point-result=void \
// DEFINE: | FileCheck %s
//
- // with RT lib:
- //
- // RUN: %{compile} enable-runtime-library=true" | %{run}
- //
- // without RT lib:
- //
- // RUN: %{compile} enable-runtime-library=false" | %{run}
-
- #map = affine_map<(d0, d1, d2) -> (d0, d2)>
- #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
- #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+ // RUN: %{compile} | %{run}

module {
  llvm.func @mgpuCreateSparseLtEnv()
  llvm.func @mgpuDestroySparseLtEnv()

-  //
-  // TODO: This uses our temporary ATTRIBUTE, replace with 2:4 type!
-  //
-  func.func @matmul_2to4(%arg0: tensor<16x32xf16>, %arg1: tensor<32x16xf16>, %arg2: tensor<16x16xf16>) -> tensor<16x16xf16> {
-    %0 = linalg.generic {DENSE24, indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<16x32xf16>, tensor<32x16xf16>) outs(%arg2 : tensor<16x16xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %1 = arith.mulf %in, %in_0 : f16
-      %2 = arith.addf %out, %1 : f16
-      linalg.yield %2 : f16
-    } -> tensor<16x16xf16>
-    return %0 : tensor<16x16xf16>
+  // cuSparseLt version for matmul coded by hand.
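+  // Device-side SpMM: multiplies the 16x32 matrix A (dense storage, pruned
+  // to the 2:4 pattern) with the dense 32x16 matrix B into the 16x16
+  // accumulator C. The gpu ops below lower to cuSparseLt calls.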
+  func.func @matmul24(%a : memref<16x32xf16>,
+                      %b : memref<32x16xf16>,
+                      %c : memref<16x16xf16>) {
+    %c0 = arith.constant 0.0 : f16
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1048576 = arith.constant 1048576 : index
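+    // Allocate device buffers for A, B, and C and copy the host operands in,
+    // chaining every op on the async token of the previous one.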
+    %token0 = gpu.wait async
+    %d_a, %token1 = gpu.alloc async [%token0] () : memref<16x32xf16>
+    %d_b, %token2 = gpu.alloc async [%token1] () : memref<32x16xf16>
+    %d_c, %token3 = gpu.alloc async [%token2] () : memref<16x16xf16>
+    %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
+    %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
+    %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
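+    // Wrap device A as a 2:4 structured-sparse matrix (pruned and checked)
+    // and B, C as dense tensors for the SpMM below.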
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] {PRUNE_AND_CHECK} %c16, %c32, %d_a : memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16 : index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16 : index, index into memref<16x16xf16>
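+    // Query the three workspace sizes required for the SpMM, allocate the
+    // corresponding scratch buffers, then run the sparse x dense multiply.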
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index, index into f16
+    %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
+    %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
+    %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
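+    // Tear down the sparse/dense descriptors, copy the result back into the
+    // host buffer %c, and release all device memory.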
+    %token16 = gpu.destroy_sp_mat async [%token15] %spmat
+    %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
+    %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
+    %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
+    %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
+    %token23 = gpu.dealloc async [%token22] %mem3 : memref<?xf16>
+    %token24 = gpu.dealloc async [%token23] %mem2 : memref<?xf16>
+    %token25 = gpu.dealloc async [%token24] %mem1 : memref<?xf16>
+    gpu.wait [%token25]
+    return
  }

  //
@@ -54,50 +72,49 @@ module {
    %c64 = arith.constant 64 : index

    // Matrices A, B, C (16x32, 32x16, 16x16).
+    %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major

    //
    // Setup matrix A.
    //
-    %DA = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      // (i + j/2 + 1) if j % 2 == 0 else 0
-      %cf0 = arith.constant 0.0 : f16
-      %cf1 = arith.constant 1.0 : f16
-      %j_2 = arith.floordivsi %j, %c2 : index
-      %quotient = arith.remsi %j, %c2 : index
-      %sum = arith.addi %i, %j_2 : index
-      %sum_i = arith.index_cast %sum : index to i64
-      %sum_f = arith.uitofp %sum_i : i64 to f16
-      %sum_f_plus1 = arith.addf %sum_f, %cf1 : f16
-      %is_zero = arith.cmpi "eq", %quotient, %c0 : index
-      %s = arith.select %is_zero, %sum_f_plus1, %cf0 : f16
-      tensor.yield %s : f16
-    } : tensor<16x32xf16>
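+    // A[i, 2*j] = i + j + 1 and A[i, 2*j+1] = 0, i.e. every group of four
+    // consecutive elements in a row holds exactly two nonzeros (2:4).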
+    scf.for %ai = %c0 to %c16 step %c1 {
+      scf.for %aj = %c0 to %c16 step %c1 {
+        %cf0 = arith.constant 0.0 : f16
+        %a0 = arith.addi %ai, %aj : index
+        %a1 = arith.addi %a0, %c1 : index
+        %a2 = arith.index_cast %a1 : index to i32
+        %a3 = arith.sitofp %a2 : i32 to f16
+        %ajj = arith.muli %aj, %c2 : index
+        %ajj2 = arith.addi %ajj, %c1 : index
+        memref.store %a3, %a[%ai, %ajj] : memref<16x32xf16>
+        memref.store %cf0, %a[%ai, %ajj2] : memref<16x32xf16>
+      }
+    }

    //
    // Setup matrix B.
    //
-    %DB = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      // if j_i >= 8, j_i - 8 else 0
-      %is_ge8 = arith.cmpi "sge", %j, %c8 : index
-      %j_minus8 = arith.subi %j, %c8 : index
-      %j2 = arith.select %is_ge8, %j_minus8, %j : index
-      %r_i = arith.subi %j2, %i : index
-      %r_i64 = arith.index_cast %r_i : index to i64
-      %r_f = arith.sitofp %r_i64 : i64 to f16
-      tensor.yield %r_f : f16
-    } : tensor<32x16xf16>
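+    // B[j, i] = B[j, i+8] = i - j for columns i in [0, 8), so the right half
+    // of B repeats the left half.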
+    scf.for %bi = %c0 to %c8 step %c1 {
+      scf.for %bj = %c0 to %c32 step %c1 {
+        %b0 = arith.subi %bi, %bj : index
+        %b1 = arith.index_cast %b0 : index to i32
+        %b2 = arith.sitofp %b1 : i32 to f16
+        %bii = arith.addi %bi, %c8 : index
+        memref.store %b2, %b[%bj, %bi] : memref<32x16xf16>
+        memref.store %b2, %b[%bj, %bii] : memref<32x16xf16>
+      }
+    }

    //
    // Reset matrix C.
    //
-    %DC = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      %cf0 = arith.constant 0.0 : f16
-      tensor.yield %cf0 : f16
-    } : tensor<16x16xf16>
-
+    scf.for %ci = %c0 to %c16 step %c1 {
+      scf.for %cj = %c0 to %c16 step %c1 {
+        memref.store %f0, %c[%ci, %cj] : memref<16x16xf16>
+      }
+    }

    //
    // Sanity check on 16x32 full 2:4 input matrix A.
@@ -121,7 +138,7 @@ module {
    // CHECK-NEXT: ( 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0 )
    //
    scf.for %pai = %c0 to %c16 step %c1 {
-      %pa0 = vector.transfer_read %DA[%pai, %c0], %f0 : tensor<16x32xf16>, vector<32xf16>
+      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x32xf16>, vector<32xf16>
      vector.print %pa0 : vector<32xf16>
    }

@@ -163,14 +180,12 @@ module {
    //
    //
    scf.for %pbi = %c0 to %c32 step %c1 {
-      %pb0 = vector.transfer_read %DB[%pbi, %c0], %f0 : tensor<32x16xf16>, vector<16xf16>
+      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<32x16xf16>, vector<16xf16>
      vector.print %pb0 : vector<16xf16>
    }

    // Call the kernel.
-    %t1 = arith.constant 1 : index
-    %t32 = arith.constant 32 : index
-    %c_out = call @matmul_2to4(%DA, %DB, %DC) : (tensor<16x32xf16>, tensor<32x16xf16>, tensor<16x16xf16>) -> tensor<16x16xf16>
+    call @matmul24(%a, %b, %c) : (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()

    //
    // Verify computed matrix C.
@@ -193,7 +208,7 @@ module {
    // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688, -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 )
    //
    scf.for %pci = %c0 to %c16 step %c1 {
-      %pc0 = vector.transfer_read %c_out[%pci, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
+      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
      vector.print %pc0 : vector<16xf16>
    }