Commit 3903817 (1 parent: 7658688)

[mlir][sparse][gpu] add CSC and BSR format to cuSparse GPU ops (llvm#67509)

This adds two cuSparse formats, CSC and BSR, to the GPU dialect, together with the corresponding lowering and CUDA runtime support. It also fixes a few minor omissions.

File tree: 5 files changed, +272 and -5 lines.

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (91 additions, 0 deletions)
```diff
@@ -1764,6 +1764,9 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
     using this operation. The operation returns a handle to the sparse
     matrix descriptor.
 
+    The CSR format has exactly the same memory layout as its transpose
+    in CSC format (and vice versa).
+
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
```
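The transpose remark is easy to verify on a concrete matrix. Below is a minimal self-contained sketch (matrix and variable names invented for illustration) that spells out the three CSC buffers of a small matrix and checks that they coincide with the CSR buffers of its transpose:

```cpp
// CSC buffers of A are exactly the CSR buffers of A^T.
//
//   A = | 1 0 2 |        A^T = | 1 0 |
//       | 0 3 0 |              | 0 3 |
//                              | 2 0 |
#include <cassert>
#include <vector>

int main() {
  // CSC of A (walk columns left to right): positions, row indices, values.
  std::vector<int> cscColPos = {0, 1, 2, 3}; // cols + 1 entries
  std::vector<int> cscRowIdx = {0, 1, 0};
  std::vector<double> cscVals = {1.0, 3.0, 2.0};

  // CSR of A^T (walk rows top to bottom): positions, column indices, values.
  std::vector<int> csrRowPos = {0, 1, 2, 3}; // rows + 1 entries
  std::vector<int> csrColIdx = {0, 1, 0};
  std::vector<double> csrVals = {1.0, 3.0, 2.0};

  // Byte-for-byte the same three buffers.
  assert(cscColPos == csrRowPos);
  assert(cscRowIdx == csrColIdx);
  assert(cscVals == csrVals);
}
```

In other words, moving between the two formats is a reinterpretation of the same buffers, not a data movement.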
````diff
@@ -1793,6 +1796,94 @@ def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
   }];
 }
 
+def GPU_CreateCscOp : GPU_Op<"create_csc", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in CSC format operation";
+  let description = [{
+    The `gpu.create_csc` operation initializes a sparse matrix in CSC format
+    with the given sizes from the given position, index, and values buffers.
+    The buffers must already be copied from the host to the device prior to
+    using this operation. The operation returns a handle to the sparse
+    matrix descriptor.
+
+    The CSC format has exactly the same memory layout as its transpose
+    in CSR format (and vice versa).
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the environment.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
+        %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Index:$rows,
+                   Index:$cols,
+                   Index:$nnz,
+                   AnyMemRef:$colPos,
+                   AnyMemRef:$rowIdxs,
+                   AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+                 Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $nnz `,` $colPos `,` $rowIdxs `,` $values attr-dict
+    `:` type($colPos) `,` type($rowIdxs) `,` type($values)
+  }];
+}
+
+def GPU_CreateBsrOp : GPU_Op<"create_bsr", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in BSR format operation";
+  let description = [{
+    The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
+    with the given sizes for the matrix and blocks from the given position,
+    index, and values buffers. The buffers must already be copied from the
+    host to the device prior to using this operation. The operation returns
+    a handle to the sparse matrix descriptor.
+
+    The BSR format is similar to CSR, where the column indices represent
+    two-dimensional blocks instead of a single matrix entry. Note that this
+    operation (currently) only supports storage with **square** blocks,
+    i.e., `rBlockSize == cBlockSize`.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the environment.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_bsr async [%dep]
+        %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
+        %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Index:$brows,
+                   Index:$bcols,
+                   Index:$bnnz,
+                   Index:$rBlockSize,
+                   Index:$cBlockSize,
+                   AnyMemRef:$bRowPos,
+                   AnyMemRef:$bColIdxs,
+                   AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spmat,
+                 Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $brows `,` $bcols `,` $bnnz `,` $rBlockSize `,` $cBlockSize `,`
+    $bRowPos `,` $bColIdxs `,` $values attr-dict
+    `:` type($bRowPos) `,` type($bColIdxs) `,` type($values)
+  }];
+}
+
 def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
     "pruning strategy for 2:4 sparse matrix",
     [
````
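To make the BSR operand list concrete, here is a small illustrative sketch (values and names invented) of the buffers a 4x4 matrix with square 2x2 blocks would carry; values within each block are laid out row-major, matching the CUSPARSE_ORDER_ROW choice in the runtime wrapper further down:

```cpp
// Illustrative BSR buffers: a 4x4 matrix stored as a 2x2 grid of square
// 2x2 blocks, of which two are nonzero:
//
//   A = | 1 2 . . |   nonzero blocks: B(0,0) = [1 2; 3 4]
//       | 3 4 . . |                   B(1,1) = [5 6; 7 8]
//       | . . 5 6 |
//       | . . 7 8 |
#include <cassert>
#include <vector>

int main() {
  int brows = 2, bcols = 2, bnnz = 2; // block-grid sizes, nonzero blocks
  int rBlockSize = 2, cBlockSize = 2; // square blocks only: r == c
  std::vector<int> bRowPos = {0, 1, 2}; // brows + 1 entries, like CSR rowPos
  std::vector<int> bColIdxs = {0, 1};   // block-column index per nonzero block
  // bnnz * rBlockSize * cBlockSize values, each block stored row-major.
  std::vector<double> values = {1, 2, 3, 4, 5, 6, 7, 8};
  assert((int)bRowPos.size() == brows + 1);
  assert((int)values.size() == bnnz * rBlockSize * cBlockSize);
  (void)bcols;
}
```

The position/index pair works exactly like CSR's, just at block granularity, which is why the operand list mirrors `gpu.create_csr` plus the two block-size operands.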

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp (104 additions, 5 deletions)
```diff
@@ -233,6 +233,19 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmInt32Type, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createCscCallBuilder = {
+      "mgpuCreateCsc",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+       llvmInt32Type, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createBsrCallBuilder = {
+      "mgpuCreateBsr",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+       llvmIntPtrType, llvmPointerType, llvmPointerType, llvmPointerType,
+       llvmInt32Type, llvmInt32Type, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder destroySpMatCallBuilder = {
       "mgpuDestroySpMat",
       llvmVoidType,
```
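As a reading aid (not code from the commit), the type list registered for `createBsrCallBuilder` corresponds position by position to the C signature of the `mgpuCreateBsr` wrapper added in CudaRuntimeWrappers.cpp below; `CUstream` is reduced to an opaque pointer in this sketch:

```cpp
#include <cstdint> // intptr_t, int32_t

using CUstream = void *; // stand-in for the CUDA driver stream handle

// llvmIntPtrType x5, llvmPointerType x3, llvmInt32Type x3, llvmPointerType:
void *mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, // sizes
                    intptr_t rBlockSize, intptr_t cBlockSize,      // block dims
                    void *rowPos, void *colIdxs, void *values,     // device buffers
                    int32_t ptp, int32_t itp, int32_t dtp,         // type codes
                    CUstream stream);                              // void *stream
```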
```diff
@@ -554,6 +567,8 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroyDnTensorOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCooAoSOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCsrOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateCscOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(CreateBsrOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(Create2To4SpMatOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(DestroySpMatOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpMVBufferSizeOp)
```
```diff
@@ -627,11 +642,11 @@ LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
 
 // Corresponding to cusparseIndexType_t defined in cusparse.h.
 static int32_t getCuSparseIndexTypeFrom(Type type) {
-  if (type.isa<IndexType>())
-    return 3; // CUSPARSE_INDEX_64I
-  else
+  if (type.isInteger(16))
+    return 1; // CUSPARSE_INDEX_16U
+  if (type.isInteger(32))
     return 2; // CUSPARSE_INDEX_32I
-  // TODO: add support to CUSPARSE_INDEX_16U: 1
+  return 3; // CUSPARSE_INDEX_64I
 }
 
 static int32_t getCuSparseLtDataTypeFrom(Type type) {
```
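The numeric codes returned here are the `cusparseIndexType_t` enumerator values from cusparse.h. Before this change, everything except `index` fell through to 32-bit; the rewrite distinguishes all three supported widths. A standalone restatement of the mapping (hypothetical helper name, for reference only):

```cpp
#include <cassert>
#include <cstdint>

// cusparseIndexType_t enumerator values, as in cusparse.h.
enum : int32_t {
  kIndex16U = 1, // CUSPARSE_INDEX_16U
  kIndex32I = 2, // CUSPARSE_INDEX_32I
  kIndex64I = 3, // CUSPARSE_INDEX_64I
};

// i16 -> 16U, i32 -> 32I, anything else (including MLIR `index`, which
// lowers to a 64-bit integer on typical targets) -> 64I.
int32_t indexCodeForBitwidth(unsigned bits) {
  if (bits == 16)
    return kIndex16U;
  if (bits == 32)
    return kIndex32I;
  return kIndex64I;
}

int main() {
  assert(indexCodeForBitwidth(16) == 1);
  assert(indexCodeForBitwidth(32) == 2);
  assert(indexCodeForBitwidth(64) == 3);
}
```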
```diff
@@ -684,6 +699,7 @@ static int32_t getCuSparseDataTypeFrom(Type type) {
 static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
   return spMat.getDefiningOp<gpu::Create2To4SpMatOp>().getPruneFlag();
 }
+
 // TODO: We may want a run-time (of the mlir compiler) disablement/warning:
 // cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
 // runtime (of the CUDA program) error , but it might be great if we could at
```
```diff
@@ -696,9 +712,13 @@ static bool is2To4Sparsity(Value spMat) {
     return true;
   if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
     return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+    return false;
   if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
     return false;
-  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+  if (auto op = spMat.getDefiningOp<gpu::CreateCscOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateBsrOp>())
     return false;
   // Print the spMat defining op
   spMat.getDefiningOp()->print(llvm::errs());
```
```diff
@@ -1916,6 +1936,83 @@ LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
+LogicalResult ConvertCreateCscOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateCscOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pColPos =
+      MemRefDescriptor(adaptor.getColPos()).allocatedPtr(rewriter, loc);
+  Value pRowIdxs =
+      MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
+  Value pValues =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pColPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColPos);
+    pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
+    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+  }
+  Type pType =
+      llvm::cast<MemRefType>(op.getColPos().getType()).getElementType();
+  Type iType =
+      llvm::cast<MemRefType>(op.getRowIdxs().getType()).getElementType();
+  Type dType =
+      llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+  auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+  auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  auto handle =
+      createCscCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
+                   pColPos, pRowIdxs, pValues, ptp, itp, dtp, stream})
+          .getResult();
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
+LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateBsrOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pRowPos =
+      MemRefDescriptor(adaptor.getBRowPos()).allocatedPtr(rewriter, loc);
+  Value pColIdxs =
+      MemRefDescriptor(adaptor.getBColIdxs()).allocatedPtr(rewriter, loc);
+  Value pValues =
+      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers()) {
+    pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
+    pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
+    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
+  }
+  Type pType =
+      llvm::cast<MemRefType>(op.getBRowPos().getType()).getElementType();
+  Type iType =
+      llvm::cast<MemRefType>(op.getBColIdxs().getType()).getElementType();
+  Type dType =
+      llvm::cast<MemRefType>(op.getValues().getType()).getElementType();
+  auto ptp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(pType));
+  auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType));
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  auto handle =
+      createBsrCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getBrows(), adaptor.getBcols(), adaptor.getBnnz(),
+                   adaptor.getRBlockSize(), adaptor.getCBlockSize(), pRowPos,
+                   pColIdxs, pValues, ptp, itp, dtp, stream})
+          .getResult();
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
 void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns,
                                                StringRef gpuBinaryAnnotation,
```
```diff
@@ -1941,6 +2038,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
+               ConvertCreateCscOpToGpuRuntimeCallPattern,
+               ConvertCreateBsrOpToGpuRuntimeCallPattern,
                ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
                ConvertDestroySpMatOpToGpuRuntimeCallPattern,
                ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
```

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (28 additions, 0 deletions)
```diff
@@ -526,6 +526,34 @@ mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
   return reinterpret_cast<void *>(mat);
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
+              void *rowIdxs, void *values, int32_t ptp, int32_t itp,
+              int32_t dtp, CUstream /*stream*/) {
+  cusparseSpMatDescr_t mat = nullptr;
+  auto pTp = static_cast<cusparseIndexType_t>(ptp);
+  auto iTp = static_cast<cusparseIndexType_t>(itp);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
+  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
+                                             rowIdxs, values, pTp, iTp,
+                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
+  return reinterpret_cast<void *>(mat);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
+              intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
+              int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
+  cusparseSpMatDescr_t mat = nullptr;
+  auto pTp = static_cast<cusparseIndexType_t>(ptp);
+  auto iTp = static_cast<cusparseIndexType_t>(itp);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
+  CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
+      &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
+      CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
+  return reinterpret_cast<void *>(mat);
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuDestroySpMat(void *m, CUstream /*stream*/) {
   cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
```
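For context only, here is a hedged host-side sketch of how these wrappers could be exercised directly from C++, reusing the 2x3 CSC example from earlier. It assumes the program links against the MLIR CUDA runtime wrappers library and that a CUDA device is present; all error checking is elided:

```cpp
#include <cstdint>
#include <vector>
#include <cuda.h>         // CUstream
#include <cuda_runtime.h> // cudaMalloc, cudaMemcpy, cudaFree
#include <cusparse.h>     // CUSPARSE_INDEX_64I, CUDA_R_64F

extern "C" void *mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz,
                               void *colPos, void *rowIdxs, void *values,
                               int32_t ptp, int32_t itp, int32_t dtp,
                               CUstream stream);
extern "C" void mgpuDestroySpMat(void *m, CUstream stream);

int main() {
  // Host-side CSC buffers for the earlier 2x3 example matrix.
  std::vector<int64_t> colPos = {0, 1, 2, 3}, rowIdx = {0, 1, 0};
  std::vector<double> vals = {1.0, 3.0, 2.0};

  // Copy to the device first, as the op description requires.
  void *dPos, *dIdx, *dVal;
  cudaMalloc(&dPos, colPos.size() * sizeof(int64_t));
  cudaMalloc(&dIdx, rowIdx.size() * sizeof(int64_t));
  cudaMalloc(&dVal, vals.size() * sizeof(double));
  cudaMemcpy(dPos, colPos.data(), colPos.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(dIdx, rowIdx.data(), rowIdx.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(dVal, vals.data(), vals.size() * sizeof(double),
             cudaMemcpyHostToDevice);

  // The int32 type codes are the cuSPARSE/CUDA enumerator values themselves.
  void *spmat = mgpuCreateCsc(/*rows=*/2, /*cols=*/3, /*nnz=*/3, dPos, dIdx,
                              dVal, static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUSPARSE_INDEX_64I),
                              static_cast<int32_t>(CUDA_R_64F),
                              /*stream=*/nullptr);
  mgpuDestroySpMat(spmat, /*stream=*/nullptr);

  cudaFree(dPos);
  cudaFree(dIdx);
  cudaFree(dVal);
  return 0;
}
```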

mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir (25 additions, 0 deletions)
```diff
@@ -128,4 +128,29 @@ module attributes {gpu.container_module} {
     return
   }
 
+  // CHECK-LABEL: func @csc_and_bsr
+  // CHECK: llvm.call @mgpuStreamCreate
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuCreateCsc
+  // CHECK: llvm.call @mgpuCreateBsr
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuStreamSynchronize
+  // CHECK: llvm.call @mgpuStreamDestroy
+  func.func @csc_and_bsr(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %csc, %token3 = gpu.create_csc async [%token2]
+        %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %bsr, %token4 = gpu.create_bsr async [%token3]
+        %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %token5 = gpu.destroy_sp_mat async [%token4] %csc
+    %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+    gpu.wait [%token6]
+    return
+  }
 }
```

mlir/test/Dialect/GPU/sparse-roundtrip.mlir (24 additions, 0 deletions)
```diff
@@ -128,4 +128,28 @@ module attributes {gpu.container_module} {
     return
   }
 
+  // CHECK-LABEL: func @csc_and_bsr
+  // CHECK: %{{.*}} = gpu.wait async
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csc async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_bsr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
+  // CHECK: gpu.wait [%{{.*}}]
+  // CHECK: return
+  func.func @csc_and_bsr(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %csc, %token3 = gpu.create_csc async [%token2]
+        %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %bsr, %token4 = gpu.create_bsr async [%token3]
+        %arg0, %arg0, %arg0, %arg0, %arg0, %mem1, %mem1, %mem2
+        : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %token5 = gpu.destroy_sp_mat async [%token4] %csc
+    %token6 = gpu.destroy_sp_mat async [%token5] %bsr
+    gpu.wait [%token6]
+    return
+  }
+
 }
```
