@@ -54,8 +54,9 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
         Pure,
         DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
         DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
-    Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
-  let assemblyFormat = "$dimension attr-dict";
+    Arguments<(ins GPU_DimensionAttr:$dimension,
+                   OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> {
+  let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict";
   let extraClassDefinition = [{
     void $cppClass::getAsmResultNames(
         llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) {
@@ -66,6 +67,14 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
       setNameFn(getResult(),resultName);
     }
   }];
+  let builders = [
+    OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, dimension, /*upperBound=*/nullptr);
+    }]>,
+    OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr);
+    }]>
+  ];
 }

 def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
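
Since the change lands in the shared base class, every `GPU_IndexOp` subclass below gains the same optional bound. A minimal sketch of the resulting syntax, with an illustrative bound value that is not part of this patch:

```mlir
// Unbounded form, unchanged by this patch.
%tIdX = gpu.thread_id x
// Bounded form enabled by the new optional `upper_bound` attribute.
%boundedTIdX = gpu.thread_id x upper_bound 128
```
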
@@ -78,6 +87,12 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
     ```mlir
     %cDimX = gpu.cluster_dim x
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in
+    an environment where the number of clusters per grid is greater than
+    `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }

@@ -91,6 +106,12 @@ def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
     ```mlir
     %cDimBlocksX = gpu.cluster_dim_blocks x
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in
+    an environment where the number of thread blocks per cluster is greater
+    than `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }

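The cluster queries follow the same surface pattern; a hedged sketch (bound values are assumptions for illustration, with 8 mirroring the documented `kMaxClusterDim`):

```mlir
%cDimX = gpu.cluster_dim x upper_bound 4
%cDimBlocksX = gpu.cluster_dim_blocks x upper_bound 8
```
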
@@ -104,6 +125,12 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
     ```mlir
     %cIdY = gpu.cluster_id y
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of clusters in the grid along `dimension` is
+    greater than `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }

@@ -116,6 +143,12 @@ def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
     ```mlir
     %cBlockIdY = gpu.cluster_block_id y
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of thread blocks per cluster along `dimension`
+    is greater than `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }

@@ -129,6 +162,19 @@ def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
     ```mlir
     %bDimX = gpu.block_dim x
     ```
+
+    If `known_block_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s blocks, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the block size along `dimension`
+    exceeds `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
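
To illustrate the contextual-constant scenario described in that hunk, here is a minimal sketch (module name, function name, and sizes are assumed for illustration) where a `known_block_size` annotation lets analyses treat `gpu.block_dim` as a constant:

```mlir
gpu.module @kernels {
  gpu.func @kernel() kernel attributes {known_block_size = array<i32: 32, 4, 1>} {
    // May be inferred to be the constant 32; launching this kernel with a
    // different x-dimension block size is undefined behavior.
    %bDimX = gpu.block_dim x
    gpu.return
  }
}
```
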
@@ -141,6 +187,13 @@ def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
     ```mlir
     %bIdY = gpu.block_id y
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
+    annotations in context, executions where the block index in `dimension` would
+    be greater than or equal to that bound cause undefined behavior. `upper_bound`
+    takes priority over bounds inferrable from context.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
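
As a sketch of how the bound feeds the declared `InferIntRangeInterface` (the bound value is illustrative): given the annotation below, integer range analysis may assume the result lies in `[0, 4096)`:

```mlir
%bIdY = gpu.block_id y upper_bound 4096
```
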
@@ -153,6 +206,20 @@ def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
     ```mlir
     %gDimZ = gpu.grid_dim z
     ```
+
+
+    If `known_grid_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s grid length, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the grid size in `dimension` would
+    exceed `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
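
A hedged sketch of the `gpu.launch` case mentioned above (the constants and launch configuration are illustrative): a constant grid size lets `gpu.grid_dim` be treated as that constant inside the launch body:

```mlir
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c8, %sz_by = %c1, %sz_bz = %c1)
           threads(%tx, %ty, %tz) in (%sz_tx = %c1, %sz_ty = %c1, %sz_tz = %c1) {
  // May be inferred to equal 8, the constant bound to %sz_bx.
  %gDimX = gpu.grid_dim x
  gpu.terminator
}
```
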
@@ -165,6 +232,12 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
     ```mlir
     %tIdX = gpu.thread_id x
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
+    annotations in context, executions where the thread index would be greater
+    than or equal to that bound cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }

@@ -177,14 +250,21 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
     ```mlir
     %laneId = gpu.lane_id
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` lanes per
+    subgroup cause undefined behavior. In the absence of `upper_bound`,
+    the lane id is still assumed to be non-negative and less than the
+    target-independent `kMaxSubgroupSize` (currently 128).
   }];
+  let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
   let results = (outs Index:$result);
-  let assemblyFormat = "attr-dict";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
 }

 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the subgroup id, i.e., the index of the current subgroup within the
     workgroup.
@@ -194,9 +274,13 @@ def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     ```mlir
     %sgId = gpu.subgroup_id : index
     ```
+
+    Executions where there are more than `upper_bound` subgroups per workgroup
+    cause undefined behavior. There is an implicit upper bound of `kMaxDim`
+    (currently uint32_t::max).
   }];

-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }

 def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
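
Since `lane_id` and `subgroup_id` take no dimension operand, the optional bound simply precedes the attribute dictionary (and, for `subgroup_id`, the `:` type suffix). A sketch with illustrative values:

```mlir
%laneId = gpu.lane_id upper_bound 32
%sgId = gpu.subgroup_id upper_bound 64 : index
```
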
@@ -209,14 +293,20 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {

     ```mlir
     %gidX = gpu.global_id x
+    %gidX = gpu.global_id x upper_bound 65536
     ```
+
+    The `upper_bound` attribute defines an upper bound analogously to the ones on
+    `thread_id` and `block_id`. If one is not set, the bound may be inferred from
+    a combination of `known_block_size` and `known_grid_size`-type annotations.
   }];
 }


 def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of subgroups within a workgroup.

@@ -225,14 +315,19 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
     ```mlir
     %numSg = gpu.num_subgroups : index
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` subgroups
+    per workgroup cause undefined behavior. There is a default upper bound of
+    `kMaxDim` (currently uint32_t::max).
   }];

-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }

 def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of threads within a subgroup.

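The remaining subgroup queries use the same optional-bound syntax ahead of the `:` type suffix; a sketch with illustrative values:

```mlir
%numSg = gpu.num_subgroups upper_bound 16 : index
%sgSz = gpu.subgroup_size upper_bound 32 : index
```
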
@@ -241,11 +336,20 @@ def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
     ```mlir
     %sgSz = gpu.subgroup_size : index
     ```
+
+    Executions where the number of threads per subgroup exceeds `upper_bound`
+    cause undefined behavior. When no `upper_bound` is specified, range analyses
+    and similar machinery assume the default bound of `kMaxSubgroupSize`,
+    currently 128.
   }];

-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }

+def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
+    [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
+                    "with 3 elements (if present)">]>;
+
 def GPU_GPUFuncOp : GPU_Op<"func", [
     HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
     IsolatedFromAbove
@@ -274,12 +378,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     body region, are not supported.

     A function may optionally be annotated with the block and/or grid sizes
-    that will be used when it is launched using the `gpu. known_block_size` and
-    `gpu. known_grid_size` attributes, respectively. If set, these attributes must
+    that will be used when it is launched using the `known_block_size` and
+    `known_grid_size` attributes, respectively. If set, these attributes must
     be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
     Launching a kernel that has these annotations, or that calls a function with
     these annotations, using a block size or grid size other than what is specified
-    is undefined behavior.
+    is undefined behavior. These attributes may be set on non-`gpu.func` functions
+    by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
+    the risk that they will be discarded.

     Syntax:

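A sketch of the non-`gpu.func` escape hatch described above (the function name is illustrative): the dialect-prefixed discardable form attaches the same hint to any `FunctionOpInterface` implementor, such as a `func.func`:

```mlir
func.func @kernel_like() attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
  return
}
```
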
@@ -322,7 +428,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$arg_attrs,
                        OptionalAttr<DictArrayAttr>:$res_attrs,
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
-                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs);
+                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
+                       GPU_OptionalDimSizeHintAttr:$known_block_size,
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
   let regions = (region AnyRegion:$body);

   let skipDefaultBuilders = 1;
@@ -445,36 +553,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
       return "workgroup_attributions";
     }

-    static constexpr StringLiteral getKnownBlockSizeAttrName() {
-      return StringLiteral("gpu.known_block_size");
-    }
-
-    static constexpr StringLiteral getKnownGridSizeAttrName() {
-      return StringLiteral("gpu.known_grid_size");
-    }
-
-    /// Returns the block size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.thread_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownBlockSize(gpu::Dimension dim) {
-      if (auto array =
-          (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownBlockSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
-    /// Returns the grid size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.block_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownGridSize(gpu::Dimension dim) {
-      if (auto array =
-          (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownGridSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
     /// Returns the argument types of this function.
     ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }

@@ -495,8 +573,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     LogicalResult verifyBody();
   }];
   let hasCustomAssemblyFormat = 1;
-
-  let hasVerifier = 1;
 }

 def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
@@ -723,8 +799,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
-               Optional<Index>:$clusterSizeX,
-               Optional<Index>:$clusterSizeY,
+               Optional<Index>:$clusterSizeX,
+               Optional<Index>:$clusterSizeY,
                Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
@@ -748,7 +824,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.

-    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
+    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
     dimensions are present, grouped as follows:

     -   three optional arguments that contain cluster identifiers along x,y,z
@@ -821,7 +897,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
                threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
     {
-      // Cluster, block and thread identifiers, as well as cluster/block/grid
+      // Cluster, block and thread identifiers, as well as cluster/block/grid
       // sizes are immediately usable inside body region.
       "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
     }
@@ -898,7 +974,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     unsigned getNumConfigOperands() {
      return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
     }
-    /// Returns the number of region attributes including cluster size
+    /// Returns the number of region attributes including cluster size
     unsigned getNumConfigRegionAttributes() {
      return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
     }