@@ -161,7 +161,7 @@ def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
161
161
Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
162
162
Arguments<(ins)>, Results<(outs Index:$result)> {
163
163
let description = [{
164
- Returns the subgroup id, i.e. the index of the current subgroup within the
164
+ Returns the subgroup id, i.e., the index of the current subgroup within the
165
165
workgroup.
166
166
167
167
Example:
@@ -1089,8 +1089,8 @@ def AnyIntegerOrFloatOr1DVector :
1089
1089
def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
1090
1090
let summary = "Reduce values among subgroup.";
1091
1091
let description = [{
1092
- The `subgroup_reduce` op reduces the value of every work item across a
1093
- subgroup. The result is equal for all work items of a subgroup .
1092
+ The `subgroup_reduce` op reduces the value of every lane (work item) across
1093
+ a subgroup. The result is equal for all lanes.
1094
1094
1095
1095
When the reduced value is of a vector type, each vector element is reduced
1096
1096
independently. Only 1-d vector types are allowed.
@@ -1102,8 +1102,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
1102
1102
%2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>)
1103
1103
```
1104
1104
1105
- If `uniform` flag is set either none or all work items of a subgroup
1106
- need to execute this op in convergence. The reduction operation must be one
1105
+ If the `uniform` flag is set, either none or all lanes of a subgroup need to execute
1106
+ this op in convergence. The reduction operation must be one
1107
1107
of:
1108
1108
* Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
1109
1109
`or`, `xor`
@@ -1155,30 +1155,64 @@ def GPU_ShuffleOp : GPU_Op<
1155
1155
Results<(outs I32I64F32OrF64:$shuffleResult, I1:$valid)> {
1156
1156
let summary = "Shuffles values within a subgroup.";
1157
1157
let description = [{
1158
- The "shuffle" op moves values to a different invocation within the same
1159
- subgroup.
1158
+ The "shuffle" op moves values across lanes (a.k.a., invocations,
1159
+ work items) within the same subgroup. The `width` argument specifies the
1160
+ number of lanes that participate in the shuffle, and must be uniform
1161
+ across all lanes. Further, the first `width` lanes of the subgroup must
1162
+ be active.
1160
1163
1161
- Example:
1164
+ The interpretation of the `offset` argument depends on the selected
1165
+ `mode`.
1166
+
1167
+ Returns the `shuffleResult` and `true` if the current lane id is smaller
1168
+ than `width`, and an unspecified value and `false` otherwise.
1169
+
1170
+ `xor` example:
1162
1171
1163
1172
```mlir
1164
- %1, %2 = gpu.shuffle %0, %offset, %width xor : f32
1173
+ %1, %2 = gpu.shuffle xor %0, %offset, %width : f32
1165
1174
```
1166
1175
1167
- For lane k returns the value from lane `k ^ offset` and `true` if that lane
1168
- is smaller than %width. Otherwise it returns an unspecified value and
1169
- `false`. A lane is the index of an invocation relative to its subgroup.
1176
+ For lane `k`, returns the value `%0` from lane `k ^ offset`. Every lane
1177
+ trades value with exactly one other lane.
1170
1178
1171
- The width specifies the number of invocations that participate in the
1172
- shuffle. The width needs to be the same for all invocations that participate
1173
- in the shuffle. Exactly the first `width` invocations of a subgroup need to
1174
- execute this op in convergence.
1179
+ `down` example:
1180
+
1181
+ ```mlir
1182
+ %cst1 = arith.constant 1 : i32
1183
+ %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
1184
+ ```
1185
+
1186
+ For lane `k`, returns the value from lane `(k + 1) % width`.
1187
+
1188
+ `up` example:
1189
+
1190
+ ```mlir
1191
+ %cst1 = arith.constant 1 : i32
1192
+ %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
1193
+ ```
1194
+
1195
+ For lane `k`, returns the value from lane `(k - 1) % width`.
1196
+
1197
+ `idx` example:
1198
+
1199
+ ```mlir
1200
+ %cst0 = arith.constant 0 : i32
1201
+ %7, %8 = gpu.shuffle idx %0, %cst0, %width : f32
1202
+ ```
1203
+
1204
+ Broadcasts the value from lane 0 to all lanes.
1175
1205
}];
1206
+
1207
+ let assemblyFormat = [{
1208
+ $mode $value `,` $offset `,` $width attr-dict `:` type($value)
1209
+ }];
1210
+
1176
1211
let builders = [
1177
1212
// Helper function that creates a shuffle with constant offset/width.
1178
1213
OpBuilder<(ins "Value":$value, "int32_t":$offset, "int32_t":$width,
1179
1214
"ShuffleMode":$mode)>
1180
1215
];
1181
- let assemblyFormat = "$mode $value `,` $offset `,` $width attr-dict `:` type($value)";
1182
1216
}
1183
1217
1184
1218
def GPU_BarrierOp : GPU_Op<"barrier"> {
0 commit comments