//
// NOTE: this test requires gpu-sm80
//
+ // with RT lib:
+ //
+ // RUN: mlir-opt %s \
+ // RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+ // RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
+ // RUN: mlir-cpu-runner \
+ // RUN: --shared-libs=%mlir_cuda_runtime \
+ // RUN: --shared-libs=%mlir_c_runner_utils \
+ // RUN: --e entry --entry-point-result=void \
+ // RUN: | FileCheck %s
+ //
+ // without RT lib:
+ //
// RUN: mlir-opt %s \
- // RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+ // RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// RUN: mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \

  indexing_maps = [
    affine_map<(i,j,k) -> (i,k)>,  // A
    affine_map<(i,j,k) -> (k,j)>,  // B
-     affine_map<(i,j,k) -> (i,j)>   // S (out)
+     affine_map<(i,j,k) -> (i,j)>   // S (in/out)
  ],
  iterator_types = ["parallel", "parallel", "reduction"],
-   doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+   doc = "S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j)"
}

//
// Integration test that lowers a kernel annotated as sparse to
- // actual sparse code, initializes a matching sparse storage scheme
- // from file, and runs the resulting code with the JIT compiler.
+ // actual sparse code, initializes sparse storage schemes, and
+ // runs the resulting code with the JIT compiler.
//
module {
  //
-   // A kernel that computes a sampled matrix matrix multiplication.
+   // A kernel that computes a sampled dense matrix matrix multiplication
+   // using a "spy" function and in-place update of the sampling sparse matrix.
  //
  func.func @sampled_dense_dense(%args: tensor<?x?xf32, #CSR>,
                                 %arga: tensor<?x?xf32>,
@@ -64,7 +78,7 @@ module {
  func.func private @getTensorFilename(index) -> (!Filename)

  //
-   // Main driver that reads matrix from file and calls the sparse kernel.
+   // Main driver.
  //
  func.func @entry() {
    %d0 = arith.constant 0.0 : f32
@@ -74,19 +88,13 @@ module {
    %c10 = arith.constant 10 : index

    // Initialize dense matrices.
-     %x = tensor.generate %c5, %c5 {
-     ^bb0(%i : index, %j : index):
-       tensor.yield %d0 : f32
-     } : tensor<?x?xf32>
-
    %a = tensor.generate %c5, %c10 {
    ^bb0(%i: index, %j: index):
      %p = arith.addi %i, %c1 : index
      %q = arith.index_cast %p : index to i32
      %d = arith.sitofp %q : i32 to f32
      tensor.yield %d : f32
    } : tensor<?x?xf32>
-
    %b = tensor.generate %c10, %c5 {
    ^bb0(%i: index, %j: index):
      %p = arith.addi %j, %c1 : index
@@ -104,15 +112,42 @@ module {
       : (tensor<?x?xf32, #CSR>,
          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>

+     //
    // Print the result for verification.
    //
    // CHECK: ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 )
+     //
    %vm = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
    %vv = vector.transfer_read %vm[%c0], %d0 : memref<?xf32>, vector<9xf32>
    vector.print %vv : vector<9xf32>

+     // Create a much sparser sampling matrix.
+     %t = arith.constant sparse<[[0, 0], [0, 1], [1, 0], [3, 4], [7, 7]],
+                                [1.0, 2.0, 3.0, 4.0, 5.0]
+     > : tensor<8x8xf32>
+     %q = sparse_tensor.convert %t : tensor<8x8xf32> to tensor<?x?xf32, #CSR>
+     %a2 = arith.constant dense<2.0> : tensor<8x8xf32>
+     %b1 = arith.constant dense<1.0> : tensor<8x8xf32>
+     %a2c = tensor.cast %a2 : tensor<8x8xf32> to tensor<?x?xf32>
+     %b1c = tensor.cast %b1 : tensor<8x8xf32> to tensor<?x?xf32>
+
+     // Call the kernel again.
+     %1 = call @sampled_dense_dense(%q, %a2c, %b1c)
+        : (tensor<?x?xf32, #CSR>,
+           tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
+
+     //
+     // Print the result for verification.
+     //
+     // CHECK: ( ( 17, 18, 0, 0, 0, 0, 0, 0 ), ( 19, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 20, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 21 ) )
+     //
+     %d = sparse_tensor.convert %1 : tensor<?x?xf32, #CSR> to tensor<?x?xf32>
+     %mm = vector.transfer_read %d[%c0, %c0], %d0 : tensor<?x?xf32>, vector<8x8xf32>
+     vector.print %mm : vector<8x8xf32>
+
    // Release the resources.
    bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
+     bufferization.dealloc_tensor %1 : tensor<?x?xf32, #CSR>

    return
  }
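
For reference, the arithmetic behind the second CHECK line can be reproduced with a small NumPy sketch. This is only an illustration, not part of the patch: the spy mask below is an assumption standing in for the implicit sparsity of the CSR output, while the sampled coordinates, their values, and the dense 2.0/1.0 operands come straight from the test. The first CHECK line depends on the matrix loaded from test.mtx via TENSOR0, so it is not reproduced here.

import numpy as np

# Sampling matrix S from the test: five stored entries in an 8x8 matrix.
S = np.zeros((8, 8), dtype=np.float32)
coords = [(0, 0), (0, 1), (1, 0), (3, 4), (7, 7)]
vals = [1.0, 2.0, 3.0, 4.0, 5.0]
for (i, j), v in zip(coords, vals):
    S[i, j] = v

A = np.full((8, 8), 2.0, dtype=np.float32)  # %a2 = dense<2.0>
B = np.full((8, 8), 1.0, dtype=np.float32)  # %b1 = dense<1.0>

# S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j): the spy mask confines the
# update to the stored entries of the sampling matrix (illustrative only).
spy = (S != 0).astype(np.float32)
S += spy * (A @ B)

# Every dot product SUM_k A(i,k) B(k,j) equals 8 * 2.0 * 1.0 = 16, so the
# stored entries become 17, 18, 19, 20, 21 -- the values in the CHECK line.
print(S)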