
Commit 41a07e6

[mlir][sparse] recognize NVidia 2:4 type for matmul (llvm#76758)
This removes the temporary DENSE24 attribute and replaces it with proper recognition of a dense-to-2:4 conversion. The compression will be performed on the device prior to performing the matrix multiplication. Note that we no longer need to start with the linalg.generic version; we can lift this to the proper named linalg op. Also renames some files to more consistent names.
1 parent 67c2e35 commit 41a07e6
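
For reference, a minimal sketch of the IR pattern the rewriter now recognizes, taken from the updated gpu_matmul24_lib.mlir test below: a dense tensor is converted into the NVidia 2:4 encoding and fed into a named linalg.matmul.

#NV_24 = #sparse_tensor.encoding<{
  map = ( i, j ) ->
  ( i            : dense,
    j floordiv 4 : dense,
    j mod 4      : block2_4
  )
}>

func.func @matmul24(%Ad: tensor<?x?xf16>, %B: tensor<?x?xf16>,
                    %Cin: tensor<?x?xf16>) -> tensor<?x?xf16> {
  // Dense-to-2:4 conversion; pruning and compression happen on device.
  %A = sparse_tensor.convert %Ad : tensor<?x?xf16> to tensor<?x?xf16, #NV_24>
  %C = linalg.matmul
         ins(%A, %B: tensor<?x?xf16, #NV_24>, tensor<?x?xf16>)
         outs(%Cin: tensor<?x?xf16>) -> tensor<?x?xf16>
  return %C : tensor<?x?xf16>
}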

File tree: 6 files changed, +204 -175 lines changed


mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 27 additions & 1 deletion

@@ -448,6 +448,23 @@ static bool isAdmissibleBSR(SparseTensorType &aTp) {
   return false;
 }
 
+/// Test for 2:4 matrix with suitable metadata.
+static bool isAdmissible24(SparseTensorType &aTp) {
+  return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) &&
+         aTp.isDenseLvl(1) && aTp.is2OutOf4Lvl(2) && isAdmissibleMetaData(aTp);
+}
+
+/// Test for conversion into 2:4 matrix.
+static bool isConversionInto24(Value v) {
+  if (auto cnv = v.getDefiningOp<ConvertOp>()) {
+    Value a = cnv.getResult();
+    Value d = cnv.getSource();
+    SparseTensorType aTp = getSparseTensorType(a);
+    return isDenseTensor(d) && isAdmissible24(aTp);
+  }
+  return false;
+}
+
 /// Returns a suitable sparse format for the operation and given operand
 /// types with cuSparse, or kNone if none is available.
 static CuSparseFormat getCuSparseFormat(SparseTensorType aTp,

@@ -925,6 +942,15 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
   Value C = op.getOperand(2); // we have C = AB
   SmallVector<Value> tokens;
 
+  // The cuSparselt API currently only allows pruning and compression
+  // to occur on the device. So we recognize the pattern
+  //   A' = convert A ; dense to 2:4
+  //   C = A'B        ; 2:4 matrix mult
+  // and then perform compression and matrix multiplication on device.
+  auto cnv = A.getDefiningOp<ConvertOp>();
+  assert(cnv);
+  A = cnv.getSource();
+
   // All input should be dense tensors.
   if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
     return failure();

@@ -1260,7 +1286,7 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
         maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
       if (!isDenseTensor(op.getOperand(0)) && !isDenseTensor(op.getOperand(1)))
         return rewriteSpGEMM(rewriter, op, enableRT);
-      if (op->getAttr("DENSE24"))
+      if (isConversionInto24(op.getOperand(0)))
        return rewrite2To4SpMM(rewriter, op);
      return rewriteSpMM(rewriter, op, enableRT);
    }
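
Because the test pipeline runs --linalg-generalize-named-ops before sparse GPU codegen, the named linalg.matmul reaches this rewriter already generalized. Below is a sketch of that generalized form, reconstructed from the removed test body (minus the retired DENSE24 attribute, with the A operand now carrying the #NV_24 type from the test below); isConversionInto24(op.getOperand(0)) fires because %A is produced by a sparse_tensor.convert from a dense tensor into an admissible 2:4 type.

#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>

%0 = linalg.generic {indexing_maps = [#map, #map1, #map2],
                     iterator_types = ["parallel", "parallel", "reduction"]}
    ins(%A, %B : tensor<?x?xf16, #NV_24>, tensor<?x?xf16>)
    outs(%Cin : tensor<?x?xf16>) {
^bb0(%in: f16, %in_0: f16, %out: f16):
  %1 = arith.mulf %in, %in_0 : f16
  %2 = arith.addf %out, %1 : f16
  linalg.yield %2 : f16
} -> tensor<?x?xf16>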

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 1 addition & 1 deletion

@@ -970,7 +970,7 @@ mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
   // Note that this adds a synchronization on the stream.
   // TODO: Do we want that?
   if (prune_flag == 2) {
-    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
+    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false);
     CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
         &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
     int valid = 0;

(The added boolean supplies mgpuMemAlloc's third parameter, presumably its host-shared flag, so this scratch word remains a plain device allocation.)

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir renamed to mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir

Lines changed: 16 additions & 12 deletions

@@ -1,5 +1,13 @@
 // RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
+#NV_24 = #sparse_tensor.encoding<{
+  map = ( i, j ) ->
+  ( i            : dense,
+    j floordiv 4 : dense,
+    j mod 4      : block2_4
+  )
+}>
+
 // CHECK-LABEL: func.func @matmul(
 // CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf16>,
 // CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf16>,

@@ -51,18 +59,14 @@
 // CHECK: %[[VAL_55:.*]] = bufferization.to_tensor %[[VAL_19]] : memref<?x?xf16>
 // CHECK: return %[[VAL_55]] : tensor<?x?xf16>
 // CHECK: }
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
 module {
-  func.func @matmul(%arg0: tensor<?x?xf16>, %arg1: tensor<?x?xf16>, %arg2: tensor<?x?xf16>) -> tensor<?x?xf16> {
-    %0 = linalg.generic { DENSE24, indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<?x?xf16>, tensor<?x?xf16>) outs(%arg2 : tensor<?x?xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %1 = arith.mulf %in, %in_0 : f16
-      %2 = arith.addf %out, %1 : f16
-      linalg.yield %2 : f16
-    } -> tensor<?x?xf16>
-    return %0 : tensor<?x?xf16>
+  func.func @matmul(%Ad: tensor<?x?xf16>,
+                    %B: tensor<?x?xf16>,
+                    %Cin: tensor<?x?xf16>) -> tensor<?x?xf16> {
+    %A = sparse_tensor.convert %Ad : tensor<?x?xf16> to tensor<?x?xf16, #NV_24>
+    %C = linalg.matmul
+           ins(%A, %B: tensor<?x?xf16, #NV_24>, tensor<?x?xf16>)
+           outs(%Cin: tensor<?x?xf16>) -> tensor<?x?xf16>
+    return %C : tensor<?x?xf16>
   }
 }
Lines changed: 77 additions & 62 deletions

@@ -1,40 +1,58 @@
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
+// DEFINE: %{compile} = mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// DEFINE:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// DEFINE:   %s
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
 // DEFINE:   --e main --entry-point-result=void \
 // DEFINE: | FileCheck %s
 //
-// with RT lib:
-//
-// RUN: %{compile} enable-runtime-library=true" | %{run}
-//
-// without RT lib:
-//
-// RUN: %{compile} enable-runtime-library=false" | %{run}
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+// RUN: %{compile} | %{run}
 
 module {
   llvm.func @mgpuCreateSparseLtEnv()
   llvm.func @mgpuDestroySparseLtEnv()
 
-  //
-  // TODO: This uses our temporary ATTRIBUTE, replace with 2:4 type!
-  //
-  func.func @matmul_2to4(%arg0: tensor<16x32xf16>, %arg1: tensor<32x16xf16>, %arg2: tensor<16x16xf16>) -> tensor<16x16xf16> {
-    %0 = linalg.generic { DENSE24, indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<16x32xf16>, tensor<32x16xf16>) outs(%arg2 : tensor<16x16xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %1 = arith.mulf %in, %in_0 : f16
-      %2 = arith.addf %out, %1 : f16
-      linalg.yield %2 : f16
-    } -> tensor<16x16xf16>
-    return %0 : tensor<16x16xf16>
+  // cuSparselt version for matmul coded by hand.
+  func.func @matmul24(%a : memref<16x32xf16>,
+                      %b : memref<32x16xf16>,
+                      %c : memref<16x16xf16>) {
+    %c0 = arith.constant 0.0 : f16
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1048576 = arith.constant 1048576 : index
+    %token0 = gpu.wait async
+    %d_a, %token1 = gpu.alloc async [%token0] () : memref<16x32xf16>
+    %d_b, %token2 = gpu.alloc async [%token1] () : memref<32x16xf16>
+    %d_c, %token3 = gpu.alloc async [%token2] () : memref<16x16xf16>
+    %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
+    %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
+    %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token6]{PRUNE_AND_CHECK} %c16, %c32, %d_a: memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index, index into f16
+    %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
+    %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
+    %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
+    %token16 = gpu.destroy_sp_mat async [%token15] %spmat
+    %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
+    %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
+    %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
+    %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
+    %token23 = gpu.dealloc async [%token22] %mem3 : memref<?xf16>
+    %token24 = gpu.dealloc async [%token23] %mem2 : memref<?xf16>
+    %token25 = gpu.dealloc async [%token24] %mem1 : memref<?xf16>
+    gpu.wait [%token25]
+    return
   }
 
   //

@@ -54,50 +72,49 @@ module {
     %c64 = arith.constant 64 : index
 
     // Matrices A, B, C (16x32, 32x16, 16x16).
+    %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
 
     //
     // Setup matrix A.
     //
-    %DA = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      // (i+ j/2 + 1) if j %2 == 0 else 0
-      %cf0 = arith.constant 0.0 : f16
-      %cf1 = arith.constant 1.0 : f16
-      %j_2 = arith.floordivsi %j, %c2 : index
-      %quotient = arith.remsi %j, %c2 : index
-      %sum = arith.addi %i, %j_2 : index
-      %sum_i = arith.index_cast %sum : index to i64
-      %sum_f = arith.uitofp %sum_i : i64 to f16
-      %sum_f_plus1 = arith.addf %sum_f, %cf1 : f16
-      %is_zero = arith.cmpi "eq", %quotient, %c0 : index
-      %s = arith.select %is_zero, %sum_f_plus1, %cf0 : f16
-      tensor.yield %s : f16
-    } : tensor<16x32xf16>
+    scf.for %ai = %c0 to %c16 step %c1 {
+      scf.for %aj = %c0 to %c16 step %c1 {
+        %cf0 = arith.constant 0.0: f16
+        %a0 = arith.addi %ai, %aj : index
+        %a1 = arith.addi %a0, %c1 : index
+        %a2 = arith.index_cast %a1 : index to i32
+        %a3 = arith.sitofp %a2 : i32 to f16
+        %ajj = arith.muli %aj, %c2 : index
+        %ajj2 = arith.addi %ajj, %c1 : index
+        memref.store %a3, %a[%ai, %ajj] : memref<16x32xf16>
+        memref.store %cf0, %a[%ai, %ajj2] : memref<16x32xf16>
+      }
+    }
 
     //
     // Setup matrix B.
    //
-    %DB = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      // if j_i >=8, j_i - 8 else 0
-      %is_ge8 = arith.cmpi "sge", %j, %c8 : index
-      %j_minus8 = arith.subi %j, %c8 : index
-      %j2 = arith.select %is_ge8, %j_minus8, %j : index
-      %r_i = arith.subi %j2, %i : index
-      %r_i64 = arith.index_cast %r_i : index to i64
-      %r_f = arith.sitofp %r_i64 : i64 to f16
-      tensor.yield %r_f : f16
-    } : tensor<32x16xf16>
+    scf.for %bi = %c0 to %c8 step %c1 {
+      scf.for %bj = %c0 to %c32 step %c1 {
+        %b0 = arith.subi %bi, %bj : index
+        %b1 = arith.index_cast %b0 : index to i32
+        %b2 = arith.sitofp %b1 : i32 to f16
+        %bii = arith.addi %bi, %c8 : index
+        memref.store %b2, %b[%bj, %bi] : memref<32x16xf16>
+        memref.store %b2, %b[%bj, %bii] : memref<32x16xf16>
+      }
+    }
 
     //
     // Reset matrix C.
     //
-    %DC = tensor.generate {
-    ^bb0(%i: index, %j: index):
-      %cf0 = arith.constant 0.0 : f16
-      tensor.yield %cf0 : f16
-    } : tensor<16x16xf16>
-
+    scf.for %ci = %c0 to %c16 step %c1 {
+      scf.for %cj = %c0 to %c16 step %c1 {
+        memref.store %f0, %c[%ci, %cj] : memref<16x16xf16>
+      }
+    }
 
     //
     // Sanity check on 16x32 full 2:4 input matrix A.

@@ -121,7 +138,7 @@ module {
     // CHECK-NEXT: ( 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0 )
     //
     scf.for %pai = %c0 to %c16 step %c1 {
-      %pa0 = vector.transfer_read %DA[%pai, %c0], %f0 : tensor<16x32xf16>, vector<32xf16>
+      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x32xf16>, vector<32xf16>
       vector.print %pa0 : vector<32xf16>
     }
 

@@ -163,14 +180,12 @@ module {
     //
     //
     scf.for %pbi = %c0 to %c32 step %c1 {
-      %pb0 = vector.transfer_read %DB[%pbi, %c0], %f0 : tensor<32x16xf16>, vector<16xf16>
+      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<32x16xf16>, vector<16xf16>
       vector.print %pb0 : vector<16xf16>
     }
 
     // Call the kernel.
-    %t1 = arith.constant 1 : index
-    %t32 = arith.constant 32 : index
-    %c_out = call @matmul_2to4 (%DA, %DB, %DC): (tensor<16x32xf16>, tensor<32x16xf16>, tensor<16x16xf16>) -> tensor<16x16xf16>
+    call @matmul24(%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()
 
     //
     // Verify computed matrix C.

@@ -193,7 +208,7 @@ module {
    // CHECK-NEXT: ( -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688, -6320, -5944, -5568, -5192, -4816, -4440, -4064, -3688 )
    //
    scf.for %pci = %c0 to %c16 step %c1 {
-      %pc0 = vector.transfer_read %c_out[%pci, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
+      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
      vector.print %pc0 : vector<16xf16>
    }