Commit 86888e4

[mlir][sparse][gpu] generate proper memcpy in/out host and device

Host registration is a convenient way to get CUDA kernels running, but it may be slow and does not work for all buffers (such as global constants). This revision uses proper alloc/copy/dealloc chains for buffers, using asynchronous chains to increase overlap. The host registration mechanism is kept under a flag for the output, purely for experimentation while this project ramps up.

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D148682
1 parent 851a121 commit 86888e4
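For a pair of host buffers, the generated IR now has roughly the following shape (a minimal sketch assembled from the CHECK patterns in the updated tests; the value names, the memref<1024xf64> type, the block/thread sizes, and the host buffers %h_x/%h_y are illustrative placeholders, with %h_x assumed to be the output buffer):

  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  // Copy-in: each host buffer gets its own async chain of alloc + memcpy.
  %t0 = gpu.wait async
  %d_x, %t1 = gpu.alloc async [%t0] () : memref<1024xf64>
  %t2 = gpu.memcpy async [%t1] %d_x, %h_x : memref<1024xf64>, memref<1024xf64>
  %t3 = gpu.wait async
  %d_y, %t4 = gpu.alloc async [%t3] () : memref<1024xf64>
  %t5 = gpu.memcpy async [%t4] %d_y, %h_y : memref<1024xf64>, memref<1024xf64>
  // Block on all copy-in tokens, then launch the kernel asynchronously.
  gpu.wait [%t2, %t5]
  %tk = gpu.launch_func async @sparse_kernels::@kernel0
          blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1)
          args(%d_x : memref<1024xf64>, %d_y : memref<1024xf64>)
  // Copy-out of the assumed output buffer depends on the kernel token;
  // every device buffer is deallocated, then one blocking wait drains the chains.
  %t6 = gpu.memcpy async [%tk] %h_x, %d_x : memref<1024xf64>, memref<1024xf64>
  %t7 = gpu.dealloc async [%t6] %d_x : memref<1024xf64>
  %t8 = gpu.wait async
  %t9 = gpu.dealloc async [%t8] %d_y : memref<1024xf64>
  gpu.wait [%t7, %t9]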

File tree

6 files changed (+314, -36 lines)

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 148 additions & 19 deletions
@@ -76,32 +76,151 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
 }
 
 /// Constructs code to launch GPU kernel.
-static void genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
-                             SmallVectorImpl<Value> &args,
-                             unsigned numThreads) {
+static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
+                              SmallVectorImpl<Value> &args,
+                              SmallVectorImpl<Value> &tokens,
+                              unsigned numThreads) {
   Location loc = gpuFunc->getLoc();
   Value none = TypedValue<::mlir::IntegerType>{};
   Value one = constantIndex(builder, loc, 1);
   Value numT = constantIndex(builder, loc, numThreads);
   gpu::KernelDim3 gridSize = {one, one, one};
   gpu::KernelDim3 blckSize = {numT, one, one};
-  builder.create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
-                                    /*dynSharedMemSz*/ none, args);
+  return builder
+      .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
+                                 /*dynSharedMemSz*/ none, args,
+                                 builder.getType<gpu::AsyncTokenType>(), tokens)
+      .getAsyncToken();
 }
 
 /// Maps the provided ranked host buffer into the device address space.
 /// Writes from the host are guaranteed to be visible to device kernels
 /// that are launched afterwards. Writes from the device are guaranteed
 /// to be visible on the host after synchronizing with the device kernel
-/// completion.
+/// completion. Needs to cast the buffer to a unranked buffer.
 static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                    Value mem) {
   MemRefType memTp = mem.getType().cast<MemRefType>();
   UnrankedMemRefType resTp =
       UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
   Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
   builder.create<gpu::HostRegisterOp>(loc, cast);
-  return mem; // convenience pass-through
+  return cast;
+}
+
+/// Unmaps the provided buffer, expecting the casted buffer.
+static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
+                                    Value cast) {
+  builder.create<gpu::HostUnregisterOp>(loc, cast);
+}
+
+/// Generates first wait in an asynchronous chain.
+static Value genFirstWait(OpBuilder &builder, Location loc) {
+  Type tokenType = builder.getType<gpu::AsyncTokenType>();
+  return builder.create<gpu::WaitOp>(loc, tokenType, ValueRange())
+      .getAsyncToken();
+}
+
+/// Generates last, blocking wait in an asynchronous chain.
+static void genBlockingWait(OpBuilder &builder, Location loc,
+                            ValueRange operands) {
+  builder.create<gpu::WaitOp>(loc, Type(), operands);
+}
+
+/// Allocates memory on the device.
+/// TODO: A `host_shared` attribute could be used to indicate that
+/// the buffer is visible by both host and device, but lowering
+/// that feature does not seem to be fully supported yet.
+static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
+                                   Value token) {
+  auto tp = mem.getType().cast<ShapedType>();
+  auto elemTp = tp.getElementType();
+  auto shape = tp.getShape();
+  auto memTp = MemRefType::get(shape, elemTp);
+  SmallVector<Value> dynamicSizes;
+  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
+    if (shape[r] == ShapedType::kDynamic) {
+      Value dim = constantIndex(builder, loc, r);
+      Value dimOp = builder.create<memref::DimOp>(loc, mem, dim);
+      dynamicSizes.push_back(dimOp);
+    }
+  }
+  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
+                                      token, dynamicSizes, ValueRange());
+}
+
+/// Deallocates memory from the device.
+static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
+                              Value token) {
+  return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
+      .getAsyncToken();
+}
+
+/// Copies memory between host and device (direction is implicit).
+static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
+                           Value src, Value token) {
+  return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
+      .getAsyncToken();
+}
+
+/// Prepares the outlined arguments, passing scalars and buffers in. Here we
+/// assume that the first buffer is the one allocated for output. We create
+/// a set of properly chained asynchronous allocation/copy pairs to increase
+/// overlap before launching the kernel.
+/// TODO: the output assumption may be a bit too brittle
+static Value genParametersIn(OpBuilder &builder, Location loc,
+                             SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens,
+                             bool useHostRegistrationForOut) {
+  Value out;
+  // Scalars are passed by value.
+  for (Value s : scalars)
+    args.push_back(s);
+  // Buffers are need to be made visible on device.
+  for (Value b : buffers) {
+    if (useHostRegistrationForOut) {
+      out = genHostRegisterMemref(builder, loc, b);
+      args.push_back(b);
+      useHostRegistrationForOut = false;
+      continue;
+    }
+    Value firstToken = genFirstWait(builder, loc);
+    auto alloc = genAllocMemRef(builder, loc, b, firstToken);
+    Value devMem = alloc.getResult(0);
+    Value depToken = alloc.getAsyncToken(); // copy-after-alloc
+    args.push_back(devMem);
+    tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
+  }
+  return out;
+}
+
+/// Finalizes the outlined arguments. The output buffer is copied depending
+/// on the kernel token and then deallocated. All other buffers are simply
+/// deallocated. Then we wait for all operations to complete.
+static void genParametersOut(OpBuilder &builder, Location loc, Value out,
+                             Value kernelToken, SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens) {
+  unsigned base = scalars.size();
+  for (unsigned i = base, e = args.size(); i < e; i++) {
+    Value firstToken;
+    if (i == base) {
+      // Assumed output parameter: unregister or copy-out.
+      if (out) {
+        genHostUnregisterMemref(builder, loc, out);
+        out = Value();
+        continue;
+      }
+      firstToken =
+          genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
+    } else {
+      firstToken = genFirstWait(builder, loc);
+    }
+    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
+  }
 }
 
 /// Constructs code for new GPU kernel.
@@ -158,10 +277,8 @@ static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
 
 /// Proof-of-concept rewriter. This rule generates a CUDA implementation
 /// for each outermost forall loop generated by the sparse compiler.
-//
-// TODO: right works with parallelization-strategy=dense-outer-loop
-// but give this its own flags in the future
-//
+/// TODO: right works with parallelization-strategy=dense-outer-loop
+/// but give this its own flags in the future
 struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
   using OpRewritePattern<scf::ParallelOp>::OpRewritePattern;
 
@@ -211,22 +328,34 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
       else
         return failure(); // don't know how to share
     }
-    // Prepare the outlined arguments, register buffers.
+    // Pass outlined non-constant values.
+    // TODO: Experiment with `useHostRegistrationForOut` to see if we want to
+    // keep the feature at all (either through a heuristic or compiler
+    // option for gpu codegen).
     Location loc = forallOp->getLoc();
     SmallVector<Value> args;
-    for (Value s : scalars)
-      args.push_back(s);
-    for (Value b : buffers)
-      args.push_back(genHostRegisterMemref(rewriter, loc, b));
-    auto saveIp = rewriter.saveInsertionPoint();
+    SmallVector<Value> tokens;
+    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
+                                /*useHostRegistrationForOut=*/false);
     // Set up GPU module and construct GPU function.
+    auto saveIp = rewriter.saveInsertionPoint();
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
     auto gpuModule = genGPUModule(rewriter, topModule);
     auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
-    // Generate code that launches the kernel.
+    // Generate code that launches the kernel asynchronously, blocking on all
+    // opens tokens and yielding a new token for the output.
+    // TODO: Passing in tokens to launch up does not seem to be properly lowered
+    // by cubin yet, hence the current blocking wait.
     rewriter.restoreInsertionPoint(saveIp);
-    genLaunchGPUFunc(rewriter, gpuFunc, args, numThreads);
+    genBlockingWait(rewriter, loc, tokens);
+    tokens.clear();
+    Value kernelToken =
+        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
+    // Finalize the outlined arguments.
+    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
+                     tokens);
+    genBlockingWait(rewriter, loc, tokens);
     rewriter.eraseOp(forallOp);
     return success();
   }
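
For comparison, the host-registration path that remains available through the useHostRegistrationForOut flag casts the buffer to an unranked memref, registers it before the launch, and unregisters it afterwards. A minimal sketch of the IR that genHostRegisterMemref/genHostUnregisterMemref produce (value names and the memref type are placeholders):

  %cast = memref.cast %h_out : memref<1024xf64> to memref<*xf64>
  gpu.host_register %cast : memref<*xf64>
  // ... the kernel launch reads and writes the registered host buffer directly ...
  gpu.host_unregister %cast : memref<*xf64>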

mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir

Lines changed: 38 additions & 4 deletions
@@ -7,12 +7,46 @@
 
 //
 // CHECK-LABEL: gpu.module @sparse_kernels
-// CHECK-DAG: gpu.func @kernel0
-// CHECK-DAG: gpu.func @kernel1
+// CHECK: gpu.func @kernel1
+// CHECK: gpu.func @kernel0
 //
 // CHECK-LABEL: func.func @matmuls
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel0 blocks
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.memcpy async [%[[T1]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.memcpy async [%[[T0]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
 //
 func.func @matmuls(%A: tensor<1024x8xf64>,
                    %B: tensor<8x1024xf64, #CSR>,

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir

Lines changed: 28 additions & 6 deletions
@@ -47,12 +47,34 @@
 //
 //
 // CHECK-LABEL: func.func @matmul
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir

Lines changed: 28 additions & 6 deletions
@@ -43,12 +43,34 @@
 // CHECK: }
 //
 // CHECK-LABEL: func.func @matvec
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
