[mlir][sparse][gpu] add CSC and BSR format to cuSparse GPU ops #67509

Merged (1 commit) on Sep 27, 2023
91 changes: 91 additions & 0 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1764,6 +1764,9 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
using this operation. The operation returns a handle to the sparse
matrix descriptor.

The CSR format has exactly the same memory layout as its transpose
in CSC format (and vice versa).

If the `async` keyword is present, the op is executed asynchronously (i.e.
it does not block until the execution has finished on the device). In
that case, it returns a !gpu.async.token in addition to the environment.
@@ -1793,6 +1796,94 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
}];
}

def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> {
let summary = "Create sparse matrix in CSC format operation";
let description = [{
The `gpu.create_csc` operation initializes a sparse matrix in CSC format
with the given sizes from the given position, index, and values buffers.
The buffers must already be copied from the host to the device prior to
using this operation. The operation returns a handle to the sparse
matrix descriptor.

The CSC format has exactly the same memory layout as its transpose
in CSR format (and vice versa).

If the `async` keyword is present, the op is executed asynchronously (i.e.
it does not block until the execution has finished on the device). In
that case, it returns a !gpu.async.token in addition to the environment.

Example:

```mlir
%spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
%rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
```
}];

let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
Index:$rows,
Index:$cols,
Index:$nnz,
AnyMemRef:$colPos,
AnyMemRef:$rowIdxs,
AnyMemRef:$values);
let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
Optional<GPU_AsyncToken>:$asyncToken);

let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
$rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict
`:` type($colPos) `,` type($rowIdxs) `,` type($values)
}];
}
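
For readers less familiar with compressed sparse column storage, here is a minimal sketch (illustrative only, not part of this change) of what the three buffers passed to `gpu.create_csc` contain for a small concrete matrix. The array names mirror the op operands, and `int64_t` is used on the assumption that `memref<?xindex>` lowers to 64-bit indices on the target.

```cpp
#include <cstdint>
#include <vector>

// CSC buffers for the 3x4 matrix
//   [ 1 0 0 2 ]
//   [ 0 3 0 0 ]
//   [ 4 0 5 0 ]
// which has nnz = 5 stored entries.
int main() {
  // colPos has cols + 1 entries; colPos[j+1] - colPos[j] is the number of
  // nonzeros in column j.
  std::vector<int64_t> colPos = {0, 2, 3, 4, 5};
  // rowIdxs[k] is the row of the k-th stored entry, walking the matrix
  // column by column.
  std::vector<int64_t> rowIdxs = {0, 2, 1, 2, 0};
  // values[k] is the value of the k-th stored entry.
  std::vector<double> values = {1.0, 4.0, 3.0, 5.0, 2.0};
  // After copying these buffers to the device, they become the %colPos,
  // %rowIdx, and %values operands of gpu.create_csc.
  return 0;
}
```

Reinterpreting the same three buffers as rowPos/colIdxs/values describes the CSR form of the transposed matrix, which is the layout equivalence the description above refers to.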

def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> {
let summary = "Create sparse matrix in BSR format operation";
let description = [{
The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
with the given sizes for the matrix and blocks from the given position,
index, and values buffers. The buffers must already be copied from the
host to the device prior to using this operation. The operation returns
a handle to the sparse matrix descriptor.

The BSR format is similar to CSR, except that each column index refers to
a two-dimensional block instead of a single matrix entry. Note that this
operation (currently) only supports storage with **square** blocks,
i.e., `rBlockSize == cBlockSize`.

If the `async` keyword is present, the op is executed asynchronously (i.e.
it does not block until the execution has finished on the device). In
that case, it returns a !gpu.async.token in addition to the environment.

Example:

```mlir
%spmat, %token = gpu.create_bsr async [%dep]
%brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
%bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
```
}];

let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
Index:$brows,
Index:$bcols,
Index:$bnnz,
Index:$rBlockSize,
Index:$cBlockSize,
AnyMemRef:$bRowPos,
AnyMemRef:$bColIdxs,
AnyMemRef:$values);
let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
Optional<GPU_AsyncToken>:$asyncToken);

let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
$brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,`
$bRowPos `,` $bColIdxs `,` $values attr-dict
`:` type($bRowPos) `,` type($bColIdxs) `,` type($values)
}];
}
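
Along the same lines, a sketch (again illustrative only, with made-up block values) of the BSR buffers for a 4x4 matrix stored with square 2x2 blocks; the row-major layout of values inside each block matches the `CUSPARSE_ORDER_ROW` setting used by the `mgpuCreateBsr` wrapper added further down in this PR.

```cpp
#include <cstdint>
#include <vector>

// BSR buffers for a 4x4 matrix split into 2x2 blocks, i.e. brows = bcols = 2
// and rBlockSize = cBlockSize = 2, with the block pattern
//   [ B00 B01 ]
//   [  0  B11 ]
// so bnnz = 3 stored blocks.
int main() {
  // bRowPos has brows + 1 entries, counting stored blocks per block row.
  std::vector<int64_t> bRowPos = {0, 2, 3};
  // bColIdxs[k] is the block column of the k-th stored block.
  std::vector<int64_t> bColIdxs = {0, 1, 1};
  // Each stored block contributes rBlockSize * cBlockSize values, laid out
  // row by row within the block (CUSPARSE_ORDER_ROW).
  std::vector<double> values = {
      1.0, 2.0,  3.0,  4.0,  // B00
      5.0, 6.0,  7.0,  8.0,  // B01
      9.0, 10.0, 11.0, 12.0, // B11
  };
  return 0;
}
```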

def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
"pruning strategy for 2:4 sparse matrix",
[
109 changes: 104 additions & 5 deletions mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -233,6 +233,19 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
{llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder createCscCallBuilder = {
"mgpuCreateCsc",
llvmPointerType,
{llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
llvmInt32Type, llvmPointerType /* void *stream */}};
FunctionCallBuilder createBsrCallBuilder = {
"mgpuCreateBsr",
llvmPointerType,
{llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmPointerType, llvmPointerType, llvmPointerType,
llvmInt32Type, llvmInt32Type, llvmInt32Type,
llvmPointerType /* void *stream */}};
FunctionCallBuilder destroySpMatCallBuilder = {
"mgpuDestroySpMat",
llvmVoidType,
@@ -554,6 +567,8 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroyDnTensorOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooAoSOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCsrOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCscOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateBsrOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(Create2To4SpMatOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroySpMatOp)
DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpMVBufferSizeOp)
@@ -627,11 +642,11 @@ LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,

// Corresponding to cusparseIndexType_t defined in cusparse.h.
static int32_t getCuSparseIndexTypeFrom(Type type) {
if (type.isInteger(16))
return 1; // CUSPARSE_INDEX_16U
if (type.isInteger(32))
return 2; // CUSPARSE_INDEX_32I
return 3; // CUSPARSE_INDEX_64I
}

static int32_t getCuSparseLtDataTypeFrom(Type type) {
@@ -684,6 +699,7 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
}

// TODO: We may want a run-time (of the mlir compiler) disablement/warning:
// cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
// runtime (of the CUDA program) error , but it might be great if we could at
@@ -696,9 +712,13 @@ static bool is2To4Sparsity(Value spMat) {
return true;
if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
return false;
if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
return false;
if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
return false;
if (auto op = spMat.getDefiningOp<gpu::CreateCscOp>())
return false;
if (auto op = spMat.getDefiningOp<gpu::CreateBsrOp>())
return false;
// Print the spMat defining op
spMat.getDefiningOp()->print(llvm::errs());
@@ -1916,6 +1936,83 @@ LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}

LogicalResult ConvertCreateCscOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::CreateCscOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
failed(isAsyncWithOneDependency(rewriter, op)))
return failure();
Location loc = op.getLoc();
auto stream = adaptor.getAsyncDependencies().front();
Value pColPos =
MemRefDescriptor(adaptor.getColPos()).allocatedPtr(rewriter, loc);
Value pRowIdxs =
MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
Value pValues =
MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
if (!getTypeConverter()->useOpaquePointers()) {
pColPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColPos);
pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
}
Type pType =
llvm::cast<MemRefType>(op.getColPos().getType()).getElementType();
Type iType =
llvm::cast<MemRefType>(op.getRowIdxs().getType()).getElementType();
Type dType =
llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
auto handle =
createCscCallBuilder
.create(loc, rewriter,
{adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
pColPos, pRowIdxs, pValues, ptp, itp, dtp, stream})
.getResult();
rewriter.replaceOp(op, {handle, stream});
return success();
}

LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::CreateBsrOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
failed(isAsyncWithOneDependency(rewriter, op)))
return failure();
Location loc = op.getLoc();
auto stream = adaptor.getAsyncDependencies().front();
Value pRowPos =
MemRefDescriptor(adaptor.getBRowPos()).allocatedPtr(rewriter, loc);
Value pColIdxs =
MemRefDescriptor(adaptor.getBColIdxs()).allocatedPtr(rewriter, loc);
Value pValues =
MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
if (!getTypeConverter()->useOpaquePointers()) {
pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
}
Type pType =
llvm::cast<MemRefType>(op.getBRowPos().getType()).getElementType();
Type iType =
llvm::cast<MemRefType>(op.getBColIdxs().getType()).getElementType();
Type dType =
llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
auto handle =
createBsrCallBuilder
.create(loc, rewriter,
{adaptor.getBrows(), adaptor.getBcols(), adaptor.getBnnz(),
adaptor.getRBlockSize(), adaptor.getCBlockSize(), pRowPos,
pColIdxs, pValues, ptp, itp, dtp, stream})
.getResult();
rewriter.replaceOp(op, {handle, stream});
return success();
}

void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,
@@ -1941,6 +2038,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
ConvertCreateCooOpToGpuRuntimeCallPattern,
ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
ConvertCreateCsrOpToGpuRuntimeCallPattern,
ConvertCreateCscOpToGpuRuntimeCallPattern,
ConvertCreateBsrOpToGpuRuntimeCallPattern,
ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
ConvertDestroySpMatOpToGpuRuntimeCallPattern,
ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
28 changes: 28 additions & 0 deletions mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -526,6 +526,34 @@ mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
void *rowIdxs, void *values, int32_t ptp, int32_t itp,
int32_t dtp, CUstream /*stream*/) {
cusparseSpMatDescr_t mat = nullptr;
auto pTp = static_cast<cusparseIndexType_t>(ptp);
auto iTp = static_cast<cusparseIndexType_t>(itp);
auto dTp = static_cast<cudaDataType_t>(dtp);
CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
rowIdxs, values, pTp, iTp,
CUSPARSE_INDEX_BASE_ZERO, dTp))
return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
cusparseSpMatDescr_t mat = nullptr;
auto pTp = static_cast<cusparseIndexType_t>(ptp);
auto iTp = static_cast<cusparseIndexType_t>(itp);
auto dTp = static_cast<cudaDataType_t>(dtp);
CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
&mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
return reinterpret_cast<void *>(mat);
}
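
For exercising the new wrappers outside of generated code, a host-side sketch along the following lines should work, assuming the program is linked against the CUDA runtime, cuSPARSE, and the MLIR CUDA runtime wrapper library (the `extern "C"` declarations are repeated only to keep the sketch self-contained, and error checking is omitted). It reuses the CSC example data from the earlier sketch; the enum casts should match what the lowering picks for `memref<?xindex>` positions/indices and `memref<?xf64>` values. `mgpuCreateBsr` can be driven the same way, with the extra block-size arguments.

```cpp
#include <cstdint>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusparse.h>

// Wrapper entry points defined in CudaRuntimeWrappers.cpp.
extern "C" void *mgpuCreateCsc(intptr_t, intptr_t, intptr_t, void *, void *,
                               void *, int32_t, int32_t, int32_t, CUstream);
extern "C" void mgpuDestroySpMat(void *, CUstream);

int main() {
  // Host-side CSC buffers for the 3x4 example matrix (5 nonzeros).
  int64_t colPosH[] = {0, 2, 3, 4, 5};
  int64_t rowIdxsH[] = {0, 2, 1, 2, 0};
  double valuesH[] = {1.0, 4.0, 3.0, 5.0, 2.0};

  // Copy the buffers to the device, as required before the sparse matrix
  // descriptor is created.
  void *colPosD, *rowIdxsD, *valuesD;
  cudaMalloc(&colPosD, sizeof(colPosH));
  cudaMalloc(&rowIdxsD, sizeof(rowIdxsH));
  cudaMalloc(&valuesD, sizeof(valuesH));
  cudaMemcpy(colPosD, colPosH, sizeof(colPosH), cudaMemcpyHostToDevice);
  cudaMemcpy(rowIdxsD, rowIdxsH, sizeof(rowIdxsH), cudaMemcpyHostToDevice);
  cudaMemcpy(valuesD, valuesH, sizeof(valuesH), cudaMemcpyHostToDevice);

  // 64-bit positions and indices, f64 values; the stream argument is unused
  // by the wrapper.
  void *spmat = mgpuCreateCsc(/*rows=*/3, /*cols=*/4, /*nnz=*/5, colPosD,
                              rowIdxsD, valuesD,
                              static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUDA_R_64F),
                              /*stream=*/nullptr);
  mgpuDestroySpMat(spmat, /*stream=*/nullptr);

  cudaFree(colPosD);
  cudaFree(rowIdxsD);
  cudaFree(valuesD);
  return 0;
}
```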

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroySpMat(void *m, CUstream /*stream*/) {
cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
@@ -128,4 +128,29 @@ module attributes {gpu.container_module} {
return
}

// CHECK-LABEL: func @csc_and_bsr
// CHECK: llvm.call @mgpuStreamCreate
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuMemAlloc
// CHECK: llvm.call @mgpuCreateCsc
// CHECK: llvm.call @mgpuCreateBsr
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuDestroySpMat
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
func.func @csc_and_bsr(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
%csc, %token3 = gpu.create_csc async [%token2]
%arg0, %arg0, %arg0, %mem1, %mem1, %mem2
: memref<?xindex>, memref<?xindex>, memref<?xf64>
%bsr, %token4 = gpu.create_bsr async [%token3]
%arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
: memref<?xindex>, memref<?xindex>, memref<?xf64>
%token5 = gpu.destroy_sp_mat async [%token4] %csc
%token6 = gpu.destroy_sp_mat async [%token5] %bsr
gpu.wait [%token6]
return
}
}
24 changes: 24 additions & 0 deletions mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -128,4 +128,28 @@ module attributes {gpu.container_module} {
return
}

// CHECK-LABEL: func @csc_and_bsr
// CHECK: %{{.*}} = gpu.wait async
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
// CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
// CHECK: %{{.*}}, %{{.*}} = gpu.create_csc async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: %{{.*}}, %{{.*}} = gpu.create_bsr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: gpu.wait [%{{.*}}]
// CHECK: return
func.func @csc_and_bsr(%arg0: index) {
%token0 = gpu.wait async
%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
%csc, %token3 = gpu.create_csc async [%token2]
%arg0, %arg0, %arg0, %mem1, %mem1, %mem2
: memref<?xindex>, memref<?xindex>, memref<?xf64>
%bsr, %token4 = gpu.create_bsr async [%token3]
%arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
: memref<?xindex>, memref<?xindex>, memref<?xf64>
%token5 = gpu.destroy_sp_mat async [%token4] %csc
%token6 = gpu.destroy_sp_mat async [%token5] %bsr
gpu.wait [%token6]
return
}

}