Skip to content

Commit f85e4a2

Browse files
committed
Remove rotation semantic in gpu.shufflw up/down
There is no such semantic in SPIRV OpGroupNonUniformShuffleUp and OpGroupNonUniformShuffleDown. In addition, there is no such semantic in NVVM shfl intrinsics. Refer to NVVM IR spec https://docs.nvidia.com/cuda/archive/12.2.1/nvvm-ir-spec/index.html#data-movement "If the computed source lane index j is in range, the returned i32 value will be the value of %a from lane j; otherwise, it will be the the value of %a from the current thread. If the thread corresponding to lane j is inactive, then the returned i32 value is undefined."
1 parent 07acb22 commit f85e4a2

File tree

3 files changed

+38
-21
lines changed

3 files changed

+38
-21
lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,7 +1332,8 @@ def GPU_ShuffleOp : GPU_Op<
13321332
%3, %4 = gpu.shuffle down %0, %cst1, %width : f32
13331333
```
13341334

1335-
For lane `k`, returns the value from lane `(k + 1) % width`.
1335+
For lane `k`, returns the value from lane `(k + cst1)`. The resulting value
1336+
is undefined if the lane is out of bounds in the subgroup.
13361337

13371338
`up` example:
13381339

@@ -1341,7 +1342,8 @@ def GPU_ShuffleOp : GPU_Op<
13411342
%5, %6 = gpu.shuffle up %0, %cst1, %width : f32
13421343
```
13431344

1344-
For lane `k`, returns the value from lane `(k - 1) % width`.
1345+
For lane `k`, returns the value from lane `(k - cst1)`. The resulting value
1346+
is undefined if the lane is out of bounds in the subgroup.
13451347

13461348
`idx` example:
13471349

mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -430,12 +430,10 @@ LogicalResult GPUShuffleConversion::matchAndRewrite(
430430
unsigned subgroupSize =
431431
targetEnv.getAttr().getResourceLimits().getSubgroupSize();
432432
IntegerAttr widthAttr;
433-
// The width argument specifies the number of lanes that participate in the
434-
// shuffle. The width value should not exceed the subgroup limit.
435433
if (!matchPattern(shuffleOp.getWidth(), m_Constant(&widthAttr)) ||
436-
widthAttr.getValue().getZExtValue() > subgroupSize)
434+
widthAttr.getValue().getZExtValue() != subgroupSize)
437435
return rewriter.notifyMatchFailure(
438-
shuffleOp, "shuffle width is larger than target subgroup size");
436+
shuffleOp, "shuffle width and target subgroup size mismatch");
439437

440438
Location loc = shuffleOp.getLoc();
441439
Value trueVal = spirv::ConstantOp::getOne(rewriter.getI1Type(),
@@ -453,19 +451,14 @@ LogicalResult GPUShuffleConversion::matchAndRewrite(
453451
loc, scope, adaptor.getValue(), adaptor.getOffset());
454452
break;
455453
case gpu::ShuffleMode::DOWN:
456-
result = rewriter.create<spirv::GroupNonUniformRotateKHROp>(
457-
loc, scope, adaptor.getValue(), adaptor.getOffset(),
458-
shuffleOp.getWidth());
454+
result = rewriter.create<spirv::GroupNonUniformShuffleDownOp>(
455+
loc, scope, adaptor.getValue(), adaptor.getOffset());
459456
break;
460-
case gpu::ShuffleMode::UP: {
461-
Value offsetForShuffleDown = rewriter.create<arith::SubIOp>(
462-
loc, shuffleOp.getWidth(), adaptor.getOffset());
463-
result = rewriter.create<spirv::GroupNonUniformRotateKHROp>(
464-
loc, scope, adaptor.getValue(), offsetForShuffleDown,
465-
shuffleOp.getWidth());
457+
case gpu::ShuffleMode::UP:
458+
result = rewriter.create<spirv::GroupNonUniformShuffleUpOp>(
459+
loc, scope, adaptor.getValue(), adaptor.getOffset());
466460
break;
467461
}
468-
}
469462

470463
rewriter.replaceOp(shuffleOp, {result, trueVal});
471464
return success();

mlir/test/Conversion/GPUToSPIRV/shuffle.mlir

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,29 @@ gpu.module @kernels {
2626

2727
// -----
2828

29+
module attributes {
30+
gpu.container_module,
31+
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, #spirv.resource_limits<subgroup_size = 32>>
32+
} {
33+
34+
gpu.module @kernels {
35+
gpu.func @shuffle_xor() kernel
36+
attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
37+
%mask = arith.constant 8 : i32
38+
%width = arith.constant 16 : i32
39+
%val = arith.constant 42.0 : f32
40+
41+
// Cannot convert due to shuffle width and target subgroup size mismatch
42+
// expected-error @+1 {{failed to legalize operation 'gpu.shuffle'}}
43+
%result, %valid = gpu.shuffle xor %val, %mask, %width : f32
44+
gpu.return
45+
}
46+
}
47+
48+
}
49+
50+
// -----
51+
2952
module attributes {
3053
gpu.container_module,
3154
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, #spirv.resource_limits<subgroup_size = 16>>
@@ -54,7 +77,7 @@ gpu.module @kernels {
5477

5578
module attributes {
5679
gpu.container_module,
57-
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformRotateKHR], []>,
80+
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformShuffleRelative], []>,
5881
#spirv.resource_limits<subgroup_size = 16>>
5982
} {
6083

@@ -70,7 +93,7 @@ gpu.module @kernels {
7093
// CHECK: %[[WIDTH:.+]] = spirv.Constant 16 : i32
7194
// CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
7295
// CHECK: %{{.+}} = spirv.Constant true
73-
// CHECK: %{{.+}} = spirv.GroupNonUniformRotateKHR <Subgroup> %[[VAL]], %[[OFFSET]], cluster_size(%[[WIDTH]]) : f32, i32, i32 -> f32
96+
// CHECK: %{{.+}} = spirv.GroupNonUniformShuffleDown <Subgroup> %[[VAL]], %[[OFFSET]] : f32, i32
7497
%result, %valid = gpu.shuffle down %val, %offset, %width : f32
7598
gpu.return
7699
}
@@ -82,7 +105,7 @@ gpu.module @kernels {
82105

83106
module attributes {
84107
gpu.container_module,
85-
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformRotateKHR], []>,
108+
spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformShuffleRelative], []>,
86109
#spirv.resource_limits<subgroup_size = 16>>
87110
} {
88111

@@ -98,8 +121,7 @@ gpu.module @kernels {
98121
// CHECK: %[[WIDTH:.+]] = spirv.Constant 16 : i32
99122
// CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
100123
// CHECK: %{{.+}} = spirv.Constant true
101-
// CHECK: %[[DOWN_OFFSET:.+]] = spirv.Constant 12 : i32
102-
// CHECK: %{{.+}} = spirv.GroupNonUniformRotateKHR <Subgroup> %[[VAL]], %[[DOWN_OFFSET]], cluster_size(%[[WIDTH]]) : f32, i32, i32 -> f32
124+
// CHECK: %{{.+}} = spirv.GroupNonUniformShuffleUp <Subgroup> %[[VAL]], %[[OFFSET]] : f32, i32
103125
%result, %valid = gpu.shuffle up %val, %offset, %width : f32
104126
gpu.return
105127
}

0 commit comments

Comments
 (0)