Commit cdbdf93

[mlir][sparse][gpu] extend SDDMM gpu test
Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D153378
1 parent 55a2c4e

1 file changed: +48 lines, -13 lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir

Lines changed: 48 additions & 13 deletions
@@ -1,8 +1,21 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// with RT lib:
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --e entry --entry-point-result=void \
+// RUN: | FileCheck %s
+//
+// without RT lib:
+//
 // RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
 // RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
 // RUN: mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
@@ -21,20 +34,21 @@
   indexing_maps = [
     affine_map<(i,j,k) -> (i,k)>,  // A
     affine_map<(i,j,k) -> (k,j)>,  // B
-    affine_map<(i,j,k) -> (i,j)>   // S (out)
+    affine_map<(i,j,k) -> (i,j)>   // S (in/out)
   ],
   iterator_types = ["parallel", "parallel", "reduction"],
-  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+  doc = "S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j)"
 }
 
 //
 // Integration test that lowers a kernel annotated as sparse to
-// actual sparse code, initializes a matching sparse storage scheme
-// from file, and runs the resulting code with the JIT compiler.
+// actual sparse code, initializes sparse storage schemes, and
+// runs the resulting code with the JIT compiler.
 //
 module {
   //
-  // A kernel that computes a sampled matrix matrix multiplication.
+  // A kernel that computes a sampled dense matrix matrix multiplication
+  // using a "spy" function and in-place update of the sampling sparse matrix.
   //
   func.func @sampled_dense_dense(%args: tensor<?x?xf32, #CSR>,
                                  %arga: tensor<?x?xf32>,
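
The new doc string describes the kernel as a "spy"-sampled update: only positions that are actually stored in the sampling matrix S receive the corresponding dot product of A and B, and S is updated in place. As an illustration only (not part of this commit), a dense NumPy sketch of that update could look as follows, with spy standing in for the sparsity pattern of S:

import numpy as np

def sampled_dense_dense(S, A, B):
    # Dense sketch of S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j).
    # spy is 1 where S holds a stored (nonzero) entry and 0 elsewhere, so
    # only the sampled positions of the A*B product are added back into S.
    spy = (S != 0).astype(S.dtype)
    return S + spy * (A @ B)

In the actual test the sampling matrix is a CSR sparse tensor and the update happens on its stored values only; the dense version above is just a reference for the arithmetic.
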
@@ -64,7 +78,7 @@ module {
   func.func private @getTensorFilename(index) -> (!Filename)
 
   //
-  // Main driver that reads matrix from file and calls the sparse kernel.
+  // Main driver.
   //
   func.func @entry() {
     %d0 = arith.constant 0.0 : f32
@@ -74,19 +88,13 @@ module {
     %c10 = arith.constant 10 : index
 
     // Initialize dense matrices.
-    %x = tensor.generate %c5, %c5 {
-    ^bb0(%i : index, %j : index):
-      tensor.yield %d0 : f32
-    } : tensor<?x?xf32>
-
     %a = tensor.generate %c5, %c10 {
     ^bb0(%i: index, %j: index):
       %p = arith.addi %i, %c1 : index
       %q = arith.index_cast %p : index to i32
       %d = arith.sitofp %q : i32 to f32
       tensor.yield %d : f32
     } : tensor<?x?xf32>
-
     %b = tensor.generate %c10, %c5 {
     ^bb0(%i: index, %j: index):
       %p = arith.addi %j, %c1 : index
@@ -104,15 +112,42 @@ module {
            : (tensor<?x?xf32, #CSR>,
               tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
 
+    //
     // Print the result for verification.
     //
     // CHECK: ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 )
+    //
     %vm = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
     %vv = vector.transfer_read %vm[%c0], %d0 : memref<?xf32>, vector<9xf32>
     vector.print %vv : vector<9xf32>
 
+    // Create a much sparser sampling matrix.
+    %t = arith.constant sparse<[[0,0], [0,1], [1,0], [3,4], [7,7]],
+                               [1.0, 2.0, 3.0, 4.0, 5.0]
+    > : tensor<8x8xf32>
+    %q = sparse_tensor.convert %t : tensor<8x8xf32> to tensor<?x?xf32, #CSR>
+    %a2 = arith.constant dense<2.0> : tensor<8x8xf32>
+    %b1 = arith.constant dense<1.0> : tensor<8x8xf32>
+    %a2c = tensor.cast %a2 : tensor<8x8xf32> to tensor<?x?xf32>
+    %b1c = tensor.cast %b1 : tensor<8x8xf32> to tensor<?x?xf32>
+
+    // Call the kernel again.
+    %1 = call @sampled_dense_dense(%q, %a2c, %b1c)
+       : (tensor<?x?xf32, #CSR>,
+          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
+
+    //
+    // Print the result for verification.
+    //
+    // CHECK: ( ( 17, 18, 0, 0, 0, 0, 0, 0 ), ( 19, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 20, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 21 ) )
+    //
+    %d = sparse_tensor.convert %1 : tensor<?x?xf32, #CSR> to tensor<?x?xf32>
+    %mm = vector.transfer_read %d[%c0, %c0], %d0 : tensor<?x?xf32>, vector<8x8xf32>
+    vector.print %mm : vector<8x8xf32>
+
     // Release the resources.
     bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
+    bufferization.dealloc_tensor %1 : tensor<?x?xf32, #CSR>
 
     return
   }
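
The expected values in the new CHECK line follow directly from the added inputs: A is filled with 2.0, B with 1.0, and the reduction runs over k = 0..7, so every sampled position gains 2 x 1 x 8 = 16, turning the stored values 1..5 into 17..21. A small NumPy check of this arithmetic (again illustrative only, reusing the dense sketch above):

import numpy as np

# The 8x8 sampling matrix added by this commit: five stored entries 1..5.
S = np.zeros((8, 8), dtype=np.float32)
for (i, j), v in zip([(0, 0), (0, 1), (1, 0), (3, 4), (7, 7)],
                     [1.0, 2.0, 3.0, 4.0, 5.0]):
    S[i, j] = v

A = np.full((8, 8), 2.0, dtype=np.float32)  # %a2 = dense<2.0>
B = np.full((8, 8), 1.0, dtype=np.float32)  # %b1 = dense<1.0>

spy = (S != 0).astype(S.dtype)
out = S + spy * (A @ B)   # each sampled entry gains 2 * 1 * 8 = 16

print(out[0, 0], out[0, 1], out[1, 0], out[3, 4], out[7, 7])
# -> 17.0 18.0 19.0 20.0 21.0, matching the dense matrix in the new CHECK line
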
