Skip to content

Commit 8ee5e19

Browse files
authored
[AMDGPU] Fix @llvm.amdgcn.cs.chain with SGPR args not provably uniform (#114232)
The correct behaviour is to insert a readfirstlane. SelectionDAG was already doing this in some cases, but not in the general case for chain calls. GlobalISel was already doing this for return values but not for arguments.
1 parent 4b02877 commit 8ee5e19

8 files changed

+497
-208
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,6 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
230230
return AddrReg.getReg(0);
231231
}
232232

233-
void assignValueToReg(Register ValVReg, Register PhysReg,
234-
const CCValAssign &VA) override {
235-
MIB.addUse(PhysReg, RegState::Implicit);
236-
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
237-
MIRBuilder.buildCopy(PhysReg, ExtReg);
238-
}
239-
240233
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
241234
const MachinePointerInfo &MPO,
242235
const CCValAssign &VA) override {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3855,10 +3855,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38553855

38563856
unsigned ArgIdx = 0;
38573857
for (auto [Reg, Val] : RegsToPass) {
3858-
if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
3859-
TRI->isSGPRPhysReg(Reg)) {
3860-
// Speculatively insert a readfirstlane in case this is a uniform value in
3861-
// a VGPR.
3858+
if (ArgIdx++ >= NumSpecialInputs &&
3859+
(IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3860+
// For chain calls, the inreg arguments are required to be
3861+
// uniform. Speculatively insert a readfirstlane in case we cannot prove
3862+
// they are uniform.
3863+
//
3864+
// For other calls, if an inreg argument is known to be uniform,
3865+
// speculatively insert a readfirstlane in case it is in a VGPR.
38623866
//
38633867
// FIXME: We need to execute this in a waterfall loop if it is a divergent
38643868
// value, so let that continue to produce invalid code.

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
2424
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
2525
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
2626
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
27-
; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
28-
; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
29-
; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
27+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
28+
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
29+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
30+
; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
31+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
32+
; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
3033
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
3134
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
3235
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -50,9 +53,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
5053
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
5154
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
5255
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
53-
; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
54-
; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
55-
; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
56+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
57+
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
58+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
59+
; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
60+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
61+
; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
5662
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
5763
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
5864
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -82,9 +88,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
8288
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
8389
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
8490
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
85-
; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
86-
; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
87-
; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
91+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
92+
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
93+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
94+
; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
95+
; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
96+
; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
8897
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
8998
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
9099
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -108,9 +117,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
108117
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
109118
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
110119
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
111-
; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
112-
; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
113-
; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
120+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
121+
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
122+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
123+
; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
124+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
125+
; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
114126
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
115127
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
116128
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
5050
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
5151
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
5252
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
53-
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
53+
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
54+
; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
5455
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
5556
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
5657
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -99,8 +100,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
99100
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
100101
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
101102
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
102-
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
103-
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
103+
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32)
104+
; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
105+
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32)
106+
; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32)
104107
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
105108
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
106109
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3

0 commit comments

Comments
 (0)