Skip to content

Commit 22caafc

Browse files
committed
[mlir][sparse][gpu] end to end test for matmul
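(1) Minor bug fix in the copy back: the result tensor is now built from bufC rather than matC, whose buffer had already been deallocated [always nice to run stuff ;-)].
(2) Run the tests with and without the runtime library (even though some cases fall back to CPU).
Reviewed By: wrengr
Differential Revision: https://reviews.llvm.org/D151507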
1 parent 837d1ce commit 22caafc

File tree

4 files changed

+218
-11
lines changed

4 files changed

+218
-11
lines changed

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
601601
tokens.clear();
602602

603603
// Done.
604-
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, matC);
604+
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
605605
return success();
606606
}
607607

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
// CHECK: %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
6565
// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
6666
// CHECK: gpu.wait {{\[}}%[[VAL_65]]]
67-
// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_38]] : memref<?x?xf64>
67+
// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_34]] : memref<?x?xf64>
6868
// CHECK: return %[[VAL_66]] : tensor<?x?xf64>
6969
// CHECK: }
7070
func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
//
2+
// NOTE: this test requires gpu-sm80
3+
//
4+
// with RT lib (SoA COO):
5+
//
6+
// RUN: mlir-opt %s \
7+
// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
8+
// RUN: | mlir-cpu-runner \
9+
// RUN: --shared-libs=%mlir_cuda_runtime \
10+
// RUN: --shared-libs=%mlir_c_runner_utils \
11+
// RUN: --e main --entry-point-result=void \
12+
// RUN: | FileCheck %s
13+
//
14+
// without RT lib (AoS COO): note, may fall back to CPU
15+
//
16+
// RUN: mlir-opt %s \
17+
// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
18+
// RUN: | mlir-cpu-runner \
19+
// RUN: --shared-libs=%mlir_cuda_runtime \
20+
// RUN: --shared-libs=%mlir_c_runner_utils \
21+
// RUN: --e main --entry-point-result=void \
22+
// RUN: | FileCheck %s
23+
24+
#SortedCOO = #sparse_tensor.encoding<{
25+
lvlTypes = [ "compressed-nu", "singleton" ]
26+
}>
27+
28+
#CSR = #sparse_tensor.encoding<{
29+
lvlTypes = [ "dense", "compressed" ],
30+
posWidth = 32,
31+
crdWidth = 32
32+
}>
33+
34+
module {
35+
// Computes C += A x B with A sparse COO (the product accumulates into the incoming C).
36+
func.func @matmulCOO(%A: tensor<8x8xf32, #SortedCOO>,
37+
%B: tensor<8x8xf32>,
38+
%C: tensor<8x8xf32>) -> tensor<8x8xf32> {
39+
%D = linalg.matmul
40+
ins(%A, %B: tensor<8x8xf32, #SortedCOO>, tensor<8x8xf32>)
41+
outs(%C: tensor<8x8xf32>) -> tensor<8x8xf32>
42+
return %D: tensor<8x8xf32>
43+
}
44+
45+
// Computes C += A x B with A sparse CSR (the product accumulates into the incoming C).
46+
func.func @matmulCSR(%A: tensor<8x8xf32, #CSR>,
47+
%B: tensor<8x8xf32>,
48+
%C: tensor<8x8xf32>) -> tensor<8x8xf32> {
49+
%D = linalg.matmul
50+
ins(%A, %B: tensor<8x8xf32, #CSR>, tensor<8x8xf32>)
51+
outs(%C: tensor<8x8xf32>) -> tensor<8x8xf32>
52+
return %D: tensor<8x8xf32>
53+
}
54+
55+
func.func @dump(%mat: tensor<8x8xf32>) {
56+
%f0 = arith.constant 0.0 : f32
57+
%c0 = arith.constant 0 : index
58+
%c1 = arith.constant 1 : index
59+
%c2 = arith.constant 2 : index
60+
%c3 = arith.constant 3 : index
61+
%c4 = arith.constant 4 : index
62+
%c5 = arith.constant 5 : index
63+
%c6 = arith.constant 6 : index
64+
%c7 = arith.constant 7 : index
65+
%r0 = vector.transfer_read %mat[%c0,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
66+
vector.print %r0 : vector<8xf32>
67+
%r1 = vector.transfer_read %mat[%c1,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
68+
vector.print %r1 : vector<8xf32>
69+
%r2 = vector.transfer_read %mat[%c2,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
70+
vector.print %r2 : vector<8xf32>
71+
%r3 = vector.transfer_read %mat[%c3,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
72+
vector.print %r3 : vector<8xf32>
73+
%r4 = vector.transfer_read %mat[%c4,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
74+
vector.print %r4 : vector<8xf32>
75+
%r5 = vector.transfer_read %mat[%c5,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
76+
vector.print %r5 : vector<8xf32>
77+
%r6 = vector.transfer_read %mat[%c6,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
78+
vector.print %r6 : vector<8xf32>
79+
%r7 = vector.transfer_read %mat[%c7,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
80+
vector.print %r7 : vector<8xf32>
81+
return
82+
}
83+
84+
//
85+
// Main driver.
86+
//
87+
func.func @main() {
88+
%f0 = arith.constant 0.0 : f32
89+
%f1 = arith.constant 1.0 : f32
90+
91+
// Stress test with a dense matrix DA.
92+
%DA = tensor.generate {
93+
^bb0(%i: index, %j: index):
94+
%k = arith.addi %i, %j : index
95+
%l = arith.index_cast %k : index to i64
96+
%f = arith.uitofp %l : i64 to f32
97+
tensor.yield %f : f32
98+
} : tensor<8x8xf32>
99+
100+
// Convert to a "sparse" matrix A.
101+
%Acoo = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
102+
%Acsr = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSR>
103+
104+
// Initial C matrices.
105+
%C0 = tensor.generate {
106+
^bb0(%i: index, %j: index):
107+
tensor.yield %f0 : f32
108+
} : tensor<8x8xf32>
109+
%C1 = tensor.generate {
110+
^bb0(%i: index, %j: index):
111+
tensor.yield %f1 : f32
112+
} : tensor<8x8xf32>
113+
114+
// Call the kernels.
115+
%0 = call @matmulCOO(%Acoo, %DA, %C0) : (tensor<8x8xf32, #SortedCOO>,
116+
tensor<8x8xf32>,
117+
tensor<8x8xf32>) -> tensor<8x8xf32>
118+
%1 = call @matmulCSR(%Acsr, %DA, %C0) : (tensor<8x8xf32, #CSR>,
119+
tensor<8x8xf32>,
120+
tensor<8x8xf32>) -> tensor<8x8xf32>
121+
%2 = call @matmulCOO(%Acoo, %DA, %C1) : (tensor<8x8xf32, #SortedCOO>,
122+
tensor<8x8xf32>,
123+
tensor<8x8xf32>) -> tensor<8x8xf32>
124+
%3 = call @matmulCSR(%Acsr, %DA, %C1) : (tensor<8x8xf32, #CSR>,
125+
tensor<8x8xf32>,
126+
tensor<8x8xf32>) -> tensor<8x8xf32>
127+
128+
//
129+
// Sanity check on results.
130+
//
131+
// CHECK: ( 140, 168, 196, 224, 252, 280, 308, 336 )
132+
// CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
133+
// CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
134+
// CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
135+
// CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
136+
// CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
137+
// CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
138+
// CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
139+
//
140+
// CHECK: ( 140, 168, 196, 224, 252, 280, 308, 336 )
141+
// CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
142+
// CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
143+
// CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
144+
// CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
145+
// CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
146+
// CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
147+
// CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
148+
//
149+
// CHECK: ( 141, 169, 197, 225, 253, 281, 309, 337 )
150+
// CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
151+
// CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
152+
// CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
153+
// CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
154+
// CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
155+
// CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
156+
// CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
157+
//
158+
// CHECK: ( 141, 169, 197, 225, 253, 281, 309, 337 )
159+
// CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
160+
// CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
161+
// CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
162+
// CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
163+
// CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
164+
// CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
165+
// CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
166+
//
167+
call @dump(%0) : (tensor<8x8xf32>) -> ()
168+
call @dump(%1) : (tensor<8x8xf32>) -> ()
169+
call @dump(%2) : (tensor<8x8xf32>) -> ()
170+
call @dump(%3) : (tensor<8x8xf32>) -> ()
171+
172+
// Release the resources.
173+
bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
174+
bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
175+
176+
return
177+
}
178+
}

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,15 @@
1111
// RUN: --e main --entry-point-result=void \
1212
// RUN: | FileCheck %s
1313
//
14-
// TODO: without RT lib (AoS COO):
14+
// without RT lib (AoS COO): note, may fall back to CPU
15+
//
16+
// RUN: mlir-opt %s \
17+
// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
18+
// RUN: | mlir-cpu-runner \
19+
// RUN: --shared-libs=%mlir_cuda_runtime \
20+
// RUN: --shared-libs=%mlir_c_runner_utils \
21+
// RUN: --e main --entry-point-result=void \
22+
// RUN: | FileCheck %s
1523

1624
#SortedCOO = #sparse_tensor.encoding<{
1725
lvlTypes = [ "compressed-nu", "singleton" ]
@@ -42,6 +50,7 @@ module {
4250

4351
func.func @main() {
4452
%f0 = arith.constant 0.0 : f64
53+
%f1 = arith.constant 1.0 : f64
4554
%c0 = arith.constant 0 : index
4655
%c1 = arith.constant 1 : index
4756

@@ -52,11 +61,11 @@ module {
5261
%l = arith.index_cast %k : index to i64
5362
%f = arith.uitofp %l : i64 to f64
5463
tensor.yield %f : f64
55-
} : tensor<1024x64xf64>
64+
} : tensor<64x64xf64>
5665

5766
// Convert to a "sparse" m x n matrix A.
58-
%Acoo = sparse_tensor.convert %DA : tensor<1024x64xf64> to tensor<?x?xf64, #SortedCOO>
59-
%Acsr = sparse_tensor.convert %DA : tensor<1024x64xf64> to tensor<?x?xf64, #CSR>
67+
%Acoo = sparse_tensor.convert %DA : tensor<64x64xf64> to tensor<?x?xf64, #SortedCOO>
68+
%Acsr = sparse_tensor.convert %DA : tensor<64x64xf64> to tensor<?x?xf64, #CSR>
6069

6170
// Initialize dense vector with n elements:
6271
// (1, 2, 3, 4, ..., n)
@@ -69,26 +78,46 @@ module {
6978
tensor.yield %f : f64
7079
} : tensor<?xf64>
7180

72-
// Initialize dense vector to m zeros.
81+
// Initialize dense vectors to m zeros and m ones.
7382
%d0 = tensor.dim %Acoo, %c0 : tensor<?x?xf64, #SortedCOO>
74-
%y = tensor.generate %d0 {
83+
%y0 = tensor.generate %d0 {
7584
^bb0(%i : index):
7685
tensor.yield %f0 : f64
7786
} : tensor<?xf64>
87+
%y1 = tensor.generate %d0 {
88+
^bb0(%i : index):
89+
tensor.yield %f1 : f64
90+
} : tensor<?xf64>
7891

7992
// Call the kernels.
80-
%0 = call @matvecCOO(%Acoo, %x, %y) : (tensor<?x?xf64, #SortedCOO>, tensor<?xf64>, tensor<?xf64>) -> tensor<?xf64>
81-
%1 = call @matvecCSR(%Acsr, %x, %y) : (tensor<?x?xf64, #CSR>, tensor<?xf64>, tensor<?xf64>) -> tensor<?xf64>
93+
%0 = call @matvecCOO(%Acoo, %x, %y0) : (tensor<?x?xf64, #SortedCOO>,
94+
tensor<?xf64>,
95+
tensor<?xf64>) -> tensor<?xf64>
96+
%1 = call @matvecCSR(%Acsr, %x, %y0) : (tensor<?x?xf64, #CSR>,
97+
tensor<?xf64>,
98+
tensor<?xf64>) -> tensor<?xf64>
99+
%2 = call @matvecCOO(%Acoo, %x, %y1) : (tensor<?x?xf64, #SortedCOO>,
100+
tensor<?xf64>,
101+
tensor<?xf64>) -> tensor<?xf64>
102+
%3 = call @matvecCSR(%Acsr, %x, %y1) : (tensor<?x?xf64, #CSR>,
103+
tensor<?xf64>,
104+
tensor<?xf64>) -> tensor<?xf64>
82105

83106
//
84-
// Sanity check on results.
107+
// Sanity check on the results.
85108
//
86109
// CHECK-COUNT-2: ( 87360, 89440, 91520, 93600, 95680, 97760, 99840, 101920, 104000, 106080, 108160, 110240, 112320, 114400, 116480, 118560, 120640, 122720, 124800, 126880, 128960, 131040, 133120, 135200, 137280, 139360, 141440, 143520, 145600, 147680, 149760, 151840, 153920, 156000, 158080, 160160, 162240, 164320, 166400, 168480, 170560, 172640, 174720, 176800, 178880, 180960, 183040, 185120, 187200, 189280, 191360, 193440, 195520, 197600, 199680, 201760, 203840, 205920, 208000, 210080, 212160, 214240, 216320, 218400 )
87110
//
111+
// CHECK-COUNT-2: ( 87361, 89441, 91521, 93601, 95681, 97761, 99841, 101921, 104001, 106081, 108161, 110241, 112321, 114401, 116481, 118561, 120641, 122721, 124801, 126881, 128961, 131041, 133121, 135201, 137281, 139361, 141441, 143521, 145601, 147681, 149761, 151841, 153921, 156001, 158081, 160161, 162241, 164321, 166401, 168481, 170561, 172641, 174721, 176801, 178881, 180961, 183041, 185121, 187201, 189281, 191361, 193441, 195521, 197601, 199681, 201761, 203841, 205921, 208001, 210081, 212161, 214241, 216321, 218401 )
112+
//
88113
%pb0 = vector.transfer_read %0[%c0], %f0 : tensor<?xf64>, vector<64xf64>
89114
vector.print %pb0 : vector<64xf64>
90115
%pb1 = vector.transfer_read %1[%c0], %f0 : tensor<?xf64>, vector<64xf64>
91116
vector.print %pb1 : vector<64xf64>
117+
%pb2 = vector.transfer_read %2[%c0], %f0 : tensor<?xf64>, vector<64xf64>
118+
vector.print %pb2 : vector<64xf64>
119+
%pb3 = vector.transfer_read %3[%c0], %f0 : tensor<?xf64>, vector<64xf64>
120+
vector.print %pb3 : vector<64xf64>
92121

93122
// Release the resources.
94123
bufferization.dealloc_tensor %Acoo : tensor<?x?xf64, #SortedCOO>

0 commit comments

Comments
 (0)