[AMDGPU] Fix @llvm.amdgcn.cs.chain with SGPR args not provably uniform #114232

Merged · 2 commits · Oct 30, 2024
7 changes: 0 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -230,13 +230,6 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
    return AddrReg.getReg(0);
  }

-  void assignValueToReg(Register ValVReg, Register PhysReg,
-                        const CCValAssign &VA) override {
-    MIB.addUse(PhysReg, RegState::Implicit);
-    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
-    MIRBuilder.buildCopy(PhysReg, ExtReg);
-  }
Comment on lines -233 to -238
Contributor:

Not sure how you can delete this, or how it's related.

Contributor Author:

This overrides the base class implementation AMDGPUOutgoingValueHandler::assignValueToReg, and the only difference is that it removes the readfirstlane handling.
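
For reference, a sketch of that base-class implementation follows. This is a simplified reconstruction, not the verbatim source: the real handler in AMDGPUCallLowering.cpp also bitcasts or ptrtoints non-s32 32-bit types before the readfirstlane.

// Simplified sketch of AMDGPUOutgoingValueHandler::assignValueToReg, the
// base-class method that now also handles outgoing SGPR arguments.
void assignValueToReg(Register ValVReg, Register PhysReg,
                      const CCValAssign &VA) override {
  Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

  // If the destination is an SGPR, speculatively insert a readfirstlane in
  // case the value actually lives in a VGPR.
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
  if (TRI->isSGPRReg(MRI, PhysReg)) {
    auto ToSGPR = MIRBuilder
                      .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                      {MRI.getType(ExtReg)})
                      .addReg(ExtReg);
    ExtReg = ToSGPR.getReg(0);
  }

  MIRBuilder.buildCopy(PhysReg, ExtReg);
  // The physical register stays alive as an implicit use on the
  // instruction being built.
  MIB.addUse(PhysReg, RegState::Implicit);
}

With the override gone, outgoing inreg arguments take this path and get the same speculative readfirstlane treatment as return values.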


  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3855,10 +3855,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
-    if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
-        TRI->isSGPRPhysReg(Reg)) {
-      // Speculatively insert a readfirstlane in case this is a uniform value in
-      // a VGPR.
+    if (ArgIdx++ >= NumSpecialInputs &&
+        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
+      // For chain calls, the inreg arguments are required to be
+      // uniform. Speculatively insert a readfirstlane in case we cannot prove
+      // they are uniform.
+      //
+      // For other calls, if an inreg argument is known to be uniform,
+      // speculatively insert a readfirstlane in case it is in a VGPR.
      //
      // FIXME: We need to execute this in a waterfall loop if it is a divergent
      // value, so let that continue to produce invalid code.
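
Under that guard, the outgoing value is wrapped in llvm.amdgcn.readfirstlane before being copied into the SGPR. A minimal sketch of the single-part case on the SelectionDAG path (assumed shape; the actual code also splits values wider than 32 bits into 32-bit parts first):

// Minimal sketch: wrap Val in llvm.amdgcn.readfirstlane so the value copied
// into the SGPR is uniform. Assumes Val is a single 32-bit part.
SDValue ReadFirstLaneID =
    DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
                  ReadFirstLaneID, Val);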
36 changes: 24 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -24,9 +24,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
-; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
-; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
-; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -50,9 +53,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
-; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
-; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
-; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -82,9 +88,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
-; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
-; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
-; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -108,9 +117,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
-; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
-; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
-; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -50,7 +50,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
-; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
+; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
+; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -99,8 +100,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
-; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
+; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32)
+; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
+; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32)
+; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3