@@ -76,32 +76,151 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
}

/// Constructs code to launch GPU kernel.
- static void genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
-                              SmallVectorImpl<Value> &args,
-                              unsigned numThreads) {
+ static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
+                               SmallVectorImpl<Value> &args,
+                               SmallVectorImpl<Value> &tokens,
+                               unsigned numThreads) {
  Location loc = gpuFunc->getLoc();
  Value none = TypedValue<::mlir::IntegerType>{};
  Value one = constantIndex(builder, loc, 1);
  Value numT = constantIndex(builder, loc, numThreads);
  gpu::KernelDim3 gridSize = {one, one, one};
  gpu::KernelDim3 blckSize = {numT, one, one};
-   builder.create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
-                                     /*dynSharedMemSz*/ none, args);
+   return builder
+       .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
+                                  /*dynSharedMemSz*/ none, args,
+                                  builder.getType<gpu::AsyncTokenType>(), tokens)
+       .getAsyncToken();
}
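
// A rough sketch of the launch this helper emits, in gpu-dialect IR
// (value names and types below are illustrative, not taken from actual output):
//
//   %t = gpu.launch_func async [%tokens...] @module::@kernel
//            blocks in (%c1, %c1, %c1) threads in (%numT, %c1, %c1)
//            args(%arg0 : index, %arg1 : memref<?xf64>)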

/// Maps the provided ranked host buffer into the device address space.
/// Writes from the host are guaranteed to be visible to device kernels
/// that are launched afterwards. Writes from the device are guaranteed
/// to be visible on the host after synchronizing with the device kernel
- /// completion.
+ /// completion. Needs to cast the buffer to an unranked buffer.
static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                   Value mem) {
  MemRefType memTp = mem.getType().cast<MemRefType>();
  UnrankedMemRefType resTp =
      UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
  Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
  builder.create<gpu::HostRegisterOp>(loc, cast);
-   return mem; // convenience pass-through
+   return cast;
+ }
+
+ /// Unmaps the provided buffer, expecting the casted buffer.
+ static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
+                                     Value cast) {
+   builder.create<gpu::HostUnregisterOp>(loc, cast);
+ }
+
+ /// Generates first wait in an asynchronous chain.
+ static Value genFirstWait(OpBuilder &builder, Location loc) {
+   Type tokenType = builder.getType<gpu::AsyncTokenType>();
+   return builder.create<gpu::WaitOp>(loc, tokenType, ValueRange())
+       .getAsyncToken();
+ }
+
+ /// Generates last, blocking wait in an asynchronous chain.
+ static void genBlockingWait(OpBuilder &builder, Location loc,
+                             ValueRange operands) {
+   builder.create<gpu::WaitOp>(loc, Type(), operands);
+ }
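
// Roughly, the two wait helpers above correspond to the following gpu-dialect
// IR (value names are illustrative):
//
//   %t0 = gpu.wait async          // genFirstWait: starts a fresh token chain
//   gpu.wait [%t1, %t2, %t3]      // genBlockingWait: blocks host on tokens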
+
+ /// Allocates memory on the device.
+ /// TODO: A `host_shared` attribute could be used to indicate that
+ ///       the buffer is visible by both host and device, but lowering
+ ///       that feature does not seem to be fully supported yet.
+ static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
+                                    Value token) {
+   auto tp = mem.getType().cast<ShapedType>();
+   auto elemTp = tp.getElementType();
+   auto shape = tp.getShape();
+   auto memTp = MemRefType::get(shape, elemTp);
+   SmallVector<Value> dynamicSizes;
+   for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
+     if (shape[r] == ShapedType::kDynamic) {
+       Value dim = constantIndex(builder, loc, r);
+       Value dimOp = builder.create<memref::DimOp>(loc, mem, dim);
+       dynamicSizes.push_back(dimOp);
+     }
+   }
+   return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
+                                       token, dynamicSizes, ValueRange());
+ }
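
// For a dynamically shaped buffer, this emits roughly the following
// (illustrative IR; the dimension query feeds the async allocation):
//
//   %d0 = memref.dim %mem, %c0 : memref<?xf64>
//   %devMem, %t1 = gpu.alloc async [%t0] (%d0) : memref<?xf64>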
+
+ /// Deallocates memory from the device.
+ static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
+                               Value token) {
+   return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
+       .getAsyncToken();
+ }
+
+ /// Copies memory between host and device (direction is implicit).
+ static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
+                            Value src, Value token) {
+   return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
+       .getAsyncToken();
+ }
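
// Both helpers produce a single asynchronous gpu op that consumes one token
// and yields a new one, e.g. (illustrative IR):
//
//   %t2 = gpu.memcpy async [%t1] %devMem, %hostMem : memref<?xf64>, memref<?xf64>
//   %t3 = gpu.dealloc async [%t2] %devMem : memref<?xf64>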
+
+ /// Prepares the outlined arguments, passing scalars and buffers in. Here we
+ /// assume that the first buffer is the one allocated for output. We create
+ /// a set of properly chained asynchronous allocation/copy pairs to increase
+ /// overlap before launching the kernel.
+ /// TODO: the output assumption may be a bit too brittle
+ static Value genParametersIn(OpBuilder &builder, Location loc,
+                              SmallVectorImpl<Value> &scalars,
+                              SmallVectorImpl<Value> &buffers,
+                              SmallVectorImpl<Value> &args,
+                              SmallVectorImpl<Value> &tokens,
+                              bool useHostRegistrationForOut) {
+   Value out;
+   // Scalars are passed by value.
+   for (Value s : scalars)
+     args.push_back(s);
+   // Buffers need to be made visible on the device.
+   for (Value b : buffers) {
+     if (useHostRegistrationForOut) {
+       out = genHostRegisterMemref(builder, loc, b);
+       args.push_back(b);
+       useHostRegistrationForOut = false;
+       continue;
+     }
+     Value firstToken = genFirstWait(builder, loc);
+     auto alloc = genAllocMemRef(builder, loc, b, firstToken);
+     Value devMem = alloc.getResult(0);
+     Value depToken = alloc.getAsyncToken(); // copy-after-alloc
+     args.push_back(devMem);
+     tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
+   }
+   return out;
+ }
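
// For each buffer (other than a host-registered output), the per-iteration
// chain created above looks roughly like this (illustrative IR):
//
//   %t0 = gpu.wait async
//   %devMem, %t1 = gpu.alloc async [%t0] (...) : memref<...>
//   %t2 = gpu.memcpy async [%t1] %devMem, %b : memref<...>, memref<...>
//
// with %t2 collected in `tokens` and %devMem passed as the kernel argument.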
+
+ /// Finalizes the outlined arguments. The output buffer is copied depending
+ /// on the kernel token and then deallocated. All other buffers are simply
+ /// deallocated. Then we wait for all operations to complete.
+ static void genParametersOut(OpBuilder &builder, Location loc, Value out,
+                              Value kernelToken, SmallVectorImpl<Value> &scalars,
+                              SmallVectorImpl<Value> &buffers,
+                              SmallVectorImpl<Value> &args,
+                              SmallVectorImpl<Value> &tokens) {
+   unsigned base = scalars.size();
+   for (unsigned i = base, e = args.size(); i < e; i++) {
+     Value firstToken;
+     if (i == base) {
+       // Assumed output parameter: unregister or copy-out.
+       if (out) {
+         genHostUnregisterMemref(builder, loc, out);
+         out = Value();
+         continue;
+       }
+       firstToken =
+           genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
+     } else {
+       firstToken = genFirstWait(builder, loc);
+     }
+     tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
+   }
}
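
// For the non-registered output case, the finalization above generates,
// roughly (illustrative IR), a copy-out that depends on the kernel token,
// followed by a deallocation of the device-side output:
//
//   %t4 = gpu.memcpy async [%kernelToken] %out, %devOut : memref<...>, memref<...>
//   %t5 = gpu.dealloc async [%t4] %devOut : memref<...>
//
// while every other device buffer is deallocated on a fresh token chain.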

/// Constructs code for new GPU kernel.
@@ -158,10 +277,8 @@ static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,

/// Proof-of-concept rewriter. This rule generates a CUDA implementation
/// for each outermost forall loop generated by the sparse compiler.
- //
- // TODO: right works with parallelization-strategy=dense-outer-loop
- //       but give this its own flags in the future
- //
+ /// TODO: right now works with parallelization-strategy=dense-outer-loop
+ ///       but give this its own flags in the future
struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
  using OpRewritePattern<scf::ParallelOp>::OpRewritePattern;
@@ -211,22 +328,34 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
      else
        return failure(); // don't know how to share
    }
-   // Prepare the outlined arguments, register buffers.
+   // Pass outlined non-constant values.
+   // TODO: Experiment with `useHostRegistrationForOut` to see if we want to
+   //       keep the feature at all (either through a heuristic or compiler
+   //       option for gpu codegen).
    Location loc = forallOp->getLoc();
    SmallVector<Value> args;
-   for (Value s : scalars)
-     args.push_back(s);
-   for (Value b : buffers)
-     args.push_back(genHostRegisterMemref(rewriter, loc, b));
-   auto saveIp = rewriter.saveInsertionPoint();
+   SmallVector<Value> tokens;
+   Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
+                               /*useHostRegistrationForOut=*/false);
    // Set up GPU module and construct GPU function.
+   auto saveIp = rewriter.saveInsertionPoint();
    ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
    auto gpuModule = genGPUModule(rewriter, topModule);
    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
    genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
-   // Generate code that launches the kernel.
+   // Generate code that launches the kernel asynchronously, blocking on all
+   // open tokens and yielding a new token for the output.
+   // TODO: Passing in tokens to the launch op does not seem to be properly
+   //       lowered by cubin yet, hence the current blocking wait.
    rewriter.restoreInsertionPoint(saveIp);
-   genLaunchGPUFunc(rewriter, gpuFunc, args, numThreads);
+   genBlockingWait(rewriter, loc, tokens);
+   tokens.clear();
+   Value kernelToken =
+       genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
+   // Finalize the outlined arguments.
+   genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
+                    tokens);
+   genBlockingWait(rewriter, loc, tokens);
    rewriter.eraseOp(forallOp);
    return success();
  }