Commit 3903817 (1 parent: 7658688)

[mlir][sparse][gpu] add CSC and BSR format to cuSparse GPU ops (llvm#67509)

This adds two cuSparse formats, CSC and BSR, to the GPU dialect, together with the corresponding lowering and CUDA runtime support. It also fixes a few minor omissions.

File tree: 5 files changed, +272 and -5 lines.

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (91 additions, 0 deletions)
```diff
@@ -1764,6 +1764,9 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
     using this operation. The operation returns a handle to the sparse
     matrix descriptor.
 
+    The CSR format has exactly the same memory layout as its transpose
+    in CSC format (and vice versa).
+
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
```
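The transpose remark is easy to verify on a concrete matrix. Below is a minimal self-contained sketch (matrix and variable names invented for illustration) that spells out the three CSC buffers of a small matrix and checks that they coincide with the CSR buffers of its transpose:

```cpp
// CSC buffers of A are exactly the CSR buffers of A^T.
//
//   A = | 1 0 2 |        A^T = | 1 0 |
//       | 0 3 0 |              | 0 3 |
//                              | 2 0 |
#include <cassert>
#include <vector>

int main() {
  // CSC of A (walk columns left to right): positions, row indices, values.
  std::vector<int> cscColPos = {0, 1, 2, 3}; // cols + 1 entries
  std::vector<int> cscRowIdx = {0, 1, 0};
  std::vector<double> cscVals = {1.0, 3.0, 2.0};

  // CSR of A^T (walk rows top to bottom): positions, column indices, values.
  std::vector<int> csrRowPos = {0, 1, 2, 3}; // rows + 1 entries
  std::vector<int> csrColIdx = {0, 1, 0};
  std::vector<double> csrVals = {1.0, 3.0, 2.0};

  // Byte-for-byte the same three buffers.
  assert(cscColPos == csrRowPos);
  assert(cscRowIdx == csrColIdx);
  assert(cscVals == csrVals);
}
```

In other words, moving between the two formats is a reinterpretation of the same buffers, not a data movement.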
````diff
@@ -1793,6 +1796,94 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
   }];
 }
 
+def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in CSC format operation";
+  let description = [{
+    The `gpu.create_csc` operation initializes a sparse matrix in CSC format
+    with the given sizes from the given position, index, and values buffers.
+    The buffers must already be copied from the host to the device prior to
+    using this operation. The operation returns a handle to the sparse
+    matrix descriptor.
+
+    The CSC format has exactly the same memory layout as its transpose
+    in CSR format (and vice versa).
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the environment.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
+        %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Index:$rows,
+                   Index:$cols,
+                   Index:$nnz,
+                   AnyMemRef:$colPos,
+                   AnyMemRef:$rowIdxs,
+                   AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+                 Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict
+    `:` type($colPos) `,` type($rowIdxs) `,` type($values)
+  }];
+}
+
+def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in BSR format operation";
+  let description = [{
+    The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
+    with the given sizes for the matrix and blocks from the given position,
+    index, and values buffers. The buffers must already be copied from the
+    host to the device prior to using this operation. The operation returns
+    a handle to the sparse matrix descriptor.
+
+    The BSR format is similar to CSR, where the column indices represent
+    two-dimensional blocks instead of a single matrix entry. Note that this
+    operation (currently) only supports storage with **square** blocks,
+    i.e., `rBlockSize == cBlockSize`.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the environment.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_bsr async [%dep]
+        %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
+        %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Index:$brows,
+                   Index:$bcols,
+                   Index:$bnnz,
+                   Index:$rBlockSize,
+                   Index:$cBlockSize,
+                   AnyMemRef:$bRowPos,
+                   AnyMemRef:$bColIdxs,
+                   AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+                 Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,`
+    $bRowPos `,` $bColIdxs `,` $values attr-dict
+    `:` type($bRowPos) `,` type($bColIdxs) `,` type($values)
+  }];
+}
+
 def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
     "pruning strategy for 2:4 sparse matrix",
     [
````
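To make the BSR operand list concrete, here is a small illustrative sketch (values and names invented) of the buffers a 4x4 matrix with square 2x2 blocks would carry; values within each block are laid out row-major, matching the CUSPARSE_ORDER_ROW choice in the runtime wrapper further down:

```cpp
// Illustrative BSR buffers: a 4x4 matrix stored as a 2x2 grid of square
// 2x2 blocks, of which two are nonzero:
//
//   A = | 1 2 . . |   nonzero blocks: B(0,0) = [1 2; 3 4]
//       | 3 4 . . |                   B(1,1) = [5 6; 7 8]
//       | . . 5 6 |
//       | . . 7 8 |
#include <cassert>
#include <vector>

int main() {
  int brows = 2, bcols = 2, bnnz = 2; // block-grid sizes, nonzero blocks
  int rBlockSize = 2, cBlockSize = 2; // square blocks only: r == c
  std::vector<int> bRowPos = {0, 1, 2}; // brows + 1 entries, like CSR rowPos
  std::vector<int> bColIdxs = {0, 1};   // block-column index per nonzero block
  // bnnz * rBlockSize * cBlockSize values, each block stored row-major.
  std::vector<double> values = {1, 2, 3, 4, 5, 6, 7, 8};
  assert((int)bRowPos.size() == brows + 1);
  assert((int)values.size() == bnnz * rBlockSize * cBlockSize);
  (void)bcols;
}
```

The position/index pair works exactly like CSR's, just at block granularity, which is why the operand list mirrors `gpu.create_csr` plus the two block-size operands.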

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp (104 additions, 5 deletions)
```diff
@@ -233,6 +233,19 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmInt32Type, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createCscCallBuilder = {
+      "mgpuCreateCsc",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+       llvmInt32Type, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createBsrCallBuilder = {
+      "mgpuCreateBsr",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+       llvmIntPtrType, llvmPointerType, llvmPointerType, llvmPointerType,
+       llvmInt32Type, llvmInt32Type, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder destroySpMatCallBuilder = {
       "mgpuDestroySpMat",
       llvmVoidType,
```
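As a reading aid (not code from the commit), the type list registered for `createBsrCallBuilder` corresponds position by position to the C signature of the `mgpuCreateBsr` wrapper added in CudaRuntimeWrappers.cpp below; `CUstream` is reduced to an opaque pointer in this sketch:

```cpp
#include <cstdint> // intptr_t, int32_t

using CUstream = void *; // stand-in for the CUDA driver stream handle

// llvmIntPtrType x5, llvmPointerType x3, llvmInt32Type x3, llvmPointerType:
void *mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, // sizes
                    intptr_t rBlockSize, intptr_t cBlockSize,      // block dims
                    void *rowPos, void *colIdxs, void *values,     // device buffers
                    int32_t ptp, int32_t itp, int32_t dtp,         // type codes
                    CUstream stream);                              // void *stream
```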
```diff
@@ -554,6 +567,8 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroyDnTensorOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooAoSOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCsrOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCscOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateBsrOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(Create2To4SpMatOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroySpMatOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpMVBufferSizeOp)
```
```diff
@@ -627,11 +642,11 @@ LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
 
 // Corresponding to cusparseIndexType_t defined in cusparse.h.
 static int32_t getCuSparseIndexTypeFrom(Type type) {
-  if (type.isa<IndexType>())
-    return 3; // CUSPARSE_INDEX_64I
-  else
+  if (type.isInteger(16))
+    return 1; // CUSPARSE_INDEX_16U
+  if (type.isInteger(32))
     return 2; // CUSPARSE_INDEX_32I
-  // TODO: add support to CUSPARSE_INDEX_16U: 1
+  return 3; // CUSPARSE_INDEX_64I
 }
 
 static int32_t getCuSparseLtDataTypeFrom(Type type) {
```
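The numeric codes returned here are the `cusparseIndexType_t` enumerator values from cusparse.h. Before this change, everything except `index` fell through to 32-bit; the rewrite distinguishes all three supported widths. A standalone restatement of the mapping (hypothetical helper name, for reference only):

```cpp
#include <cassert>
#include <cstdint>

// cusparseIndexType_t enumerator values, as in cusparse.h.
enum : int32_t {
  kIndex16U = 1, // CUSPARSE_INDEX_16U
  kIndex32I = 2, // CUSPARSE_INDEX_32I
  kIndex64I = 3, // CUSPARSE_INDEX_64I
};

// i16 -> 16U, i32 -> 32I, anything else (including MLIR `index`, which
// lowers to a 64-bit integer on typical targets) -> 64I.
int32_t indexCodeForBitwidth(unsigned bits) {
  if (bits == 16)
    return kIndex16U;
  if (bits == 32)
    return kIndex32I;
  return kIndex64I;
}

int main() {
  assert(indexCodeForBitwidth(16) == 1);
  assert(indexCodeForBitwidth(32) == 2);
  assert(indexCodeForBitwidth(64) == 3);
}
```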
```diff
@@ -684,6 +699,7 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
 static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
   return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
 }
+
 // TODO: We may want a run-time (of the mlir compiler) disablement/warning:
 // cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
 // runtime (of the CUDA program) error , but it might be great if we could at
```
```diff
@@ -696,9 +712,13 @@ static bool is2To4Sparsity(Value spMat) {
     return true;
   if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
     return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+    return false;
   if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
     return false;
-  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+  if (auto op = spMat.getDefiningOp<gpu::CreateCscOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateBsrOp>())
     return false;
   // Print the spMat defining op
   spMat.getDefiningOp()->print(llvm::errs());
```
```diff
@@ -1916,6 +1936,83 @@ LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
+LogicalResult ConvertCreateCscOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateCscOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pColPos =
+      MemRefDescriptor(adaptor.getColPos()).allocatedPtr(rewriter, loc);
+  Value pRowIdxs =
+      MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
+  Value pValues =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pColPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColPos);
+    pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
+    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+  }
+  Type pType =
+      llvm::cast<MemRefType>(op.getColPos().getType()).getElementType();
+  Type iType =
+      llvm::cast<MemRefType>(op.getRowIdxs().getType()).getElementType();
+  Type dType =
+      llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+  auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+  auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  auto handle =
+      createCscCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
+                   pColPos, pRowIdxs, pValues, ptp, itp, dtp, stream})
+          .getResult();
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
+LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateBsrOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pRowPos =
+      MemRefDescriptor(adaptor.getBRowPos()).allocatedPtr(rewriter, loc);
+  Value pColIdxs =
+      MemRefDescriptor(adaptor.getBColIdxs()).allocatedPtr(rewriter, loc);
+  Value pValues =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
+    pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
+    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+  }
+  Type pType =
+      llvm::cast<MemRefType>(op.getBRowPos().getType()).getElementType();
+  Type iType =
+      llvm::cast<MemRefType>(op.getBColIdxs().getType()).getElementType();
+  Type dType =
+      llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+  auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+  auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  auto handle =
+      createBsrCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getBrows(), adaptor.getBcols(), adaptor.getBnnz(),
+                   adaptor.getRBlockSize(), adaptor.getCBlockSize(), pRowPos,
+                   pColIdxs, pValues, ptp, itp, dtp, stream})
+          .getResult();
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
 void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns,
                                                StringRef gpuBinaryAnnotation,
```
```diff
@@ -1941,6 +2038,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
+               ConvertCreateCscOpToGpuRuntimeCallPattern,
+               ConvertCreateBsrOpToGpuRuntimeCallPattern,
                ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
                ConvertDestroySpMatOpToGpuRuntimeCallPattern,
                ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
```

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (28 additions, 0 deletions)
```diff
@@ -526,6 +526,34 @@ mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
   return reinterpret_cast<void *>(mat);
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
+              void *rowIdxs, void *values, int32_t ptp, int32_t itp,
+              int32_t dtp, CUstream /*stream*/) {
+  cusparseSpMatDescr_t mat = nullptr;
+  auto pTp = static_cast<cusparseIndexType_t>(ptp);
+  auto iTp = static_cast<cusparseIndexType_t>(itp);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
+  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
+                                             rowIdxs, values, pTp, iTp,
+                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
+  return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
+              intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
+              int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
+  cusparseSpMatDescr_t mat = nullptr;
+  auto pTp = static_cast<cusparseIndexType_t>(ptp);
+  auto iTp = static_cast<cusparseIndexType_t>(itp);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
+  CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
+      &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
+      CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
+  return reinterpret_cast<void *>(mat);
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuDestroySpMat(void *m, CUstream /*stream*/) {
   cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
```
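For context only, here is a hedged host-side sketch of how these wrappers could be exercised directly from C++, reusing the 2x3 CSC example from earlier. It assumes the program links against the MLIR CUDA runtime wrappers library and that a CUDA device is present; all error checking is elided:

```cpp
#include <cstdint>
#include <vector>
#include <cuda.h>         // CUstream
#include <cuda_runtime.h> // cudaMalloc, cudaMemcpy, cudaFree
#include <cusparse.h>     // CUSPARSE_INDEX_64I, CUDA_R_64F

extern "C" void *mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz,
                               void *colPos, void *rowIdxs, void *values,
                               int32_t ptp, int32_t itp, int32_t dtp,
                               CUstream stream);
extern "C" void mgpuDestroySpMat(void *m, CUstream stream);

int main() {
  // Host-side CSC buffers for the earlier 2x3 example matrix.
  std::vector<int64_t> colPos = {0, 1, 2, 3}, rowIdx = {0, 1, 0};
  std::vector<double> vals = {1.0, 3.0, 2.0};

  // Copy to the device first, as the op description requires.
  void *dPos, *dIdx, *dVal;
  cudaMalloc(&dPos, colPos.size() * sizeof(int64_t));
  cudaMalloc(&dIdx, rowIdx.size() * sizeof(int64_t));
  cudaMalloc(&dVal, vals.size() * sizeof(double));
  cudaMemcpy(dPos, colPos.data(), colPos.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(dIdx, rowIdx.data(), rowIdx.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(dVal, vals.data(), vals.size() * sizeof(double),
             cudaMemcpyHostToDevice);

  // The int32 type codes are the cuSPARSE/CUDA enumerator values themselves.
  void *spmat = mgpuCreateCsc(/*rows=*/2, /*cols=*/3, /*nnz=*/3, dPos, dIdx,
                              dVal, static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUDA_R_64F),
                              /*stream=*/nullptr);
  mgpuDestroySpMat(spmat, /*stream=*/nullptr);

  cudaFree(dPos);
  cudaFree(dIdx);
  cudaFree(dVal);
  return 0;
}
```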

mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir (25 additions, 0 deletions)
```diff
@@ -128,4 +128,29 @@ module attributes {gpu.container_module} {
     return
   }
 
+  // CHECK-LABEL: func @csc_and_bsr
+  // CHECK: llvm.call @mgpuStreamCreate
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuCreateCsc
+  // CHECK: llvm.call @mgpuCreateBsr
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuStreamSynchronize
+  // CHECK: llvm.call @mgpuStreamDestroy
+  func.func @csc_and_bsr(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %csc, %token3 = gpu.create_csc async [%token2]
+        %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %bsr, %token4 = gpu.create_bsr async [%token3]
+        %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %token5 = gpu.destroy_sp_mat async [%token4] %csc
+    %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+    gpu.wait [%token6]
+    return
+  }
 }
```

mlir/test/Dialect/GPU/sparse-roundtrip.mlir (24 additions, 0 deletions)
```diff
@@ -128,4 +128,28 @@ module attributes {gpu.container_module} {
     return
   }
 
+  // CHECK-LABEL: func @csc_and_bsr
+  // CHECK: %{{.*}} = gpu.wait async
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csc async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_bsr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+  // CHECK: gpu.wait [%{{.*}}]
+  // CHECK: return
+  func.func @csc_and_bsr(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %csc, %token3 = gpu.create_csc async [%token2]
+        %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %bsr, %token4 = gpu.create_bsr async [%token3]
+        %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %token5 = gpu.destroy_sp_mat async [%token4] %csc
+    %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+    gpu.wait [%token6]
+    return
+  }
+
 }
```
