Skip to content

Commit 447abfc

Browse files
authored
AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 (#128494)
We should handle this for all the handled readlane and dpp ops.
1 parent a88f4f1 commit 447abfc

File tree

3 files changed

+60
-37
lines changed

3 files changed

+60
-37
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,9 +1128,34 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11281128
simplifyDemandedLaneMaskArg(IC, II, 1))
11291129
return &II;
11301130

1131+
// readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
1132+
if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
1133+
Value *BCSrc = BC->getOperand(0);
1134+
1135+
// TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
1136+
if (isTypeLegal(BCSrc->getType())) {
1137+
Module *M = IC.Builder.GetInsertBlock()->getModule();
1138+
Function *Remangled =
1139+
Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
1140+
1141+
// Make sure convergence tokens are preserved.
1142+
// TODO: CreateIntrinsic should allow directly copying bundles
1143+
SmallVector<OperandBundleDef, 2> OpBundles;
1144+
II.getOperandBundlesAsDefs(OpBundles);
1145+
1146+
SmallVector<Value *, 3> Args(II.args());
1147+
Args[0] = BCSrc;
1148+
1149+
CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
1150+
NewCall->takeName(&II);
1151+
return new BitCastInst(NewCall, II.getType());
1152+
}
1153+
}
1154+
11311155
return std::nullopt;
11321156
}
11331157
case Intrinsic::amdgcn_writelane: {
1158+
// TODO: Fold bitcast like readlane.
11341159
if (simplifyDemandedLaneMaskArg(IC, II, 1))
11351160
return &II;
11361161
return std::nullopt;

llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
55
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane(
66
; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
7-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
8-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
7+
; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
8+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
99
; CHECK-NEXT: ret i32 [[RESULT]]
1010
;
1111
%bitcast = bitcast float %val to i32
@@ -16,9 +16,9 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
1616
define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(float %val, ptr %use.ptr) {
1717
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(
1818
; CHECK-SAME: float [[VAL:%.*]], ptr [[USE_PTR:%.*]]) #[[ATTR0]] {
19-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
2019
; CHECK-NEXT: store float [[VAL]], ptr [[USE_PTR]], align 4
21-
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
20+
; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
21+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[RESULT]] to i32
2222
; CHECK-NEXT: ret i32 [[TMP1]]
2323
;
2424
%bitcast = bitcast float %val to i32
@@ -46,9 +46,7 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_call(float %val) {
4646
define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
4747
; CHECK-LABEL: define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(
4848
; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
49-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
50-
; CHECK-NEXT: [[CALL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
51-
; CHECK-NEXT: [[RESULT:%.*]] = bitcast i32 [[CALL]] to float
49+
; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
5250
; CHECK-NEXT: ret float [[RESULT]]
5351
;
5452
%bitcast = bitcast float %val to i32
@@ -60,8 +58,8 @@ define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
6058
define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
6159
; CHECK-LABEL: define i32 @test_bitcast_v2f16_to_i32_readfirstlane(
6260
; CHECK-SAME: <2 x half> [[VAL:%.*]]) #[[ATTR0]] {
63-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <2 x half> [[VAL]] to i32
64-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
61+
; CHECK-NEXT: [[RESULT1:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[VAL]])
62+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast <2 x half> [[RESULT1]] to i32
6563
; CHECK-NEXT: ret i32 [[RESULT]]
6664
;
6765
%bitcast = bitcast <2 x half> %val to i32
@@ -72,8 +70,8 @@ define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
7270
define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
7371
; CHECK-LABEL: define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(
7472
; CHECK-SAME: <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
75-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <2 x bfloat> [[VAL]] to i32
76-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
73+
; CHECK-NEXT: [[RESULT1:%.*]] = call <2 x bfloat> @llvm.amdgcn.readfirstlane.v2bf16(<2 x bfloat> [[VAL]])
74+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast <2 x bfloat> [[RESULT1]] to i32
7775
; CHECK-NEXT: ret i32 [[RESULT]]
7876
;
7977
%bitcast = bitcast <2 x bfloat> %val to i32
@@ -84,8 +82,8 @@ define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
8482
define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
8583
; CHECK-LABEL: define i64 @test_bitcast_f64_to_i64_readfirstlane(
8684
; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
87-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast double [[VAL]] to i64
88-
; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
85+
; CHECK-NEXT: [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
86+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast double [[RESULT1]] to i64
8987
; CHECK-NEXT: ret i64 [[RESULT]]
9088
;
9189
%bitcast = bitcast double %val to i64
@@ -96,8 +94,8 @@ define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
9694
define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
9795
; CHECK-LABEL: define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(
9896
; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
99-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast double [[VAL]] to <2 x i32>
100-
; CHECK-NEXT: [[RESULT:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[BITCAST]])
97+
; CHECK-NEXT: [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
98+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast double [[RESULT1]] to <2 x i32>
10199
; CHECK-NEXT: ret <2 x i32> [[RESULT]]
102100
;
103101
%bitcast = bitcast double %val to <2 x i32>
@@ -108,8 +106,8 @@ define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
108106
define i64 @test_bitcast_v4i16_to_i64_readfirstlane(<4 x i16> %val) {
109107
; CHECK-LABEL: define i64 @test_bitcast_v4i16_to_i64_readfirstlane(
110108
; CHECK-SAME: <4 x i16> [[VAL:%.*]]) #[[ATTR0]] {
111-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <4 x i16> [[VAL]] to i64
112-
; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
109+
; CHECK-NEXT: [[RESULT1:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[VAL]])
110+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast <4 x i16> [[RESULT1]] to i64
113111
; CHECK-NEXT: ret i64 [[RESULT]]
114112
;
115113
%bitcast = bitcast <4 x i16> %val to i64
@@ -145,8 +143,8 @@ define i32 @test_bitcast_v8i4_to_i32_readfirstlane(<8 x i4> %val) {
145143
define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
146144
; CHECK-LABEL: define float @test_bitcast_i32_to_f32_readfirstlane(
147145
; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
148-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32 [[VAL]] to float
149-
; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[BITCAST]])
146+
; CHECK-NEXT: [[RESULT1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]])
147+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast i32 [[RESULT1]] to float
150148
; CHECK-NEXT: ret float [[RESULT]]
151149
;
152150
%bitcast = bitcast i32 %val to float
@@ -157,8 +155,8 @@ define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
157155
define i16 @test_bitcast_f16_to_i16_readfirstlane(half %val) {
158156
; CHECK-LABEL: define i16 @test_bitcast_f16_to_i16_readfirstlane(
159157
; CHECK-SAME: half [[VAL:%.*]]) #[[ATTR0]] {
160-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast half [[VAL]] to i16
161-
; CHECK-NEXT: [[RESULT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[BITCAST]])
158+
; CHECK-NEXT: [[RESULT1:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[VAL]])
159+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast half [[RESULT1]] to i16
162160
; CHECK-NEXT: ret i16 [[RESULT]]
163161
;
164162
%bitcast = bitcast half %val to i16
@@ -181,8 +179,8 @@ define i16 @test_bitcast_v2i8_to_i16_readfirstlane(<2 x i8> %val) {
181179
define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val) {
182180
; CHECK-LABEL: define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(
183181
; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
184-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <16 x i32>
185-
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> [[BITCAST]])
182+
; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
183+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <16 x i32>
186184
; CHECK-NEXT: ret <16 x i32> [[TMP1]]
187185
;
188186
%bitcast = bitcast <16 x float> %val to <16 x i32>
@@ -193,8 +191,8 @@ define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val
193191
define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val) {
194192
; CHECK-LABEL: define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(
195193
; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
196-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <8 x i64>
197-
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> [[BITCAST]])
194+
; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
195+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <8 x i64>
198196
; CHECK-NEXT: ret <8 x i64> [[TMP1]]
199197
;
200198
%bitcast = bitcast <16 x float> %val to <8 x i64>
@@ -205,8 +203,8 @@ define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val)
205203
define i32 @test_bitcast_f32_to_i32_readlane(float %val, i32 inreg %lane.index) {
206204
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane(
207205
; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR0]] {
208-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
209-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]])
206+
; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]])
207+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
210208
; CHECK-NEXT: ret i32 [[RESULT]]
211209
;
212210
%bitcast = bitcast float %val to i32
@@ -290,8 +288,8 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(float %val) c
290288
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(
291289
; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR1]] {
292290
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
293-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
294-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
291+
; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]]) [ "convergencectrl"(token [[T]]) ]
292+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
295293
; CHECK-NEXT: ret i32 [[RESULT]]
296294
;
297295
%t = call token @llvm.experimental.convergence.entry()
@@ -304,8 +302,8 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
304302
; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(
305303
; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR1]] {
306304
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
307-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
308-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
305+
; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
306+
; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
309307
; CHECK-NEXT: ret i32 [[RESULT]]
310308
;
311309
%t = call token @llvm.experimental.convergence.entry()

llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ define i32 @test_constant() {
1212

1313
define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
1414
; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64(
15-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
16-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]])
17-
; CHECK-NEXT: ret i32 [[RESULT]]
15+
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]])
16+
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
17+
; CHECK-NEXT: ret i32 [[BITCAST]]
1818
;
1919
%bitcast = bitcast float %val to i32
2020
%result = call i32 @llvm.amdgcn.permlane64.i32(i32 %bitcast)
@@ -24,9 +24,9 @@ define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
2424
define i32 @test_bitcast_f32_to_i32_permlane64_convergencetokenn(float %val) convergent {
2525
; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64_convergencetokenn(
2626
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
27-
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
28-
; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
29-
; CHECK-NEXT: ret i32 [[RESULT]]
27+
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]]) [ "convergencectrl"(token [[T]]) ]
28+
; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
29+
; CHECK-NEXT: ret i32 [[BITCAST]]
3030
;
3131
%t = call token @llvm.experimental.convergence.entry()
3232
%bitcast = bitcast float %val to i32

0 commit comments

Comments
 (0)