Skip to content

Commit 950f323

Browse files
committed
AMDGPU: Fix verifier error on tail call target in vgprs
We allow tail calls of known uniform function pointers. This would produce a verifier error if the uniform value is in VGPRs. Insert readfirstlanes just in case this occurs, which will fold out later if it is unnecessary.

GlobalISel should need a similar fix, but it currently does not attempt tail calls of indirect calls.

Fixes subissue of #110930.
1 parent 65e69f7 commit 950f323

File tree

2 files changed

+78
-0
lines changed

2 files changed

+78
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3565,6 +3565,7 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
35653565
if (IsVarArg)
35663566
return false;
35673567

3568+
// FIXME: We need to know all arguments passed in SGPR are uniform.
35683569
for (const Argument &Arg : CallerF.args()) {
35693570
if (Arg.hasByValAttr())
35703571
return false;
@@ -3877,6 +3878,25 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38773878

38783879
std::vector<SDValue> Ops;
38793880
Ops.push_back(Chain);
3881+
3882+
if (IsTailCall) {
3883+
// isEligibleForTailCallOptimization considered whether the call target is
3884+
// divergent, but we may still end up with a uniform value in a VGPR. Insert
3885+
// a readfirstlane just in case.
3886+
SDValue ReadFirstLaneID =
3887+
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3888+
3889+
SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3890+
if (CLI.ConvergenceControlToken) {
3891+
SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue,
3892+
CLI.ConvergenceControlToken);
3893+
ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3894+
}
3895+
3896+
Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3897+
ReadfirstlaneArgs);
3898+
}
3899+
38803900
Ops.push_back(Callee);
38813901
// Add a redundant copy of the callee global which will not be legalized, as
38823902
// we need direct access to the callee later.
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
3+
4+
target triple = "amdgcn-amd-amdhsa"
5+
6+
; The tail call target is known uniform, but will be in a VGPR, so we
7+
; need readfirstlane to legalize it.
8+
define void @tail_call_uniform_vgpr_value() {
9+
; CHECK-LABEL: tail_call_uniform_vgpr_value:
10+
; CHECK: ; %bb.0:
11+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
13+
; CHECK-NEXT: ds_read_b64 v[0:1], v0
14+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15+
; CHECK-NEXT: v_readfirstlane_b32 s17, v1
16+
; CHECK-NEXT: v_readfirstlane_b32 s16, v0
17+
; CHECK-NEXT: s_setpc_b64 s[16:17]
18+
%fptr = load ptr, ptr addrspace(3) null, align 8
19+
tail call void %fptr()
20+
ret void
21+
}
22+
23+
@constant = external hidden addrspace(4) constant ptr
24+
25+
; readfirstlanes should fold out.
26+
define void @tail_call_uniform_sgpr_value() {
27+
; CHECK-LABEL: tail_call_uniform_sgpr_value:
28+
; CHECK: ; %bb.0:
29+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; CHECK-NEXT: s_getpc_b64 s[16:17]
31+
; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4
32+
; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12
33+
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
34+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
35+
; CHECK-NEXT: s_setpc_b64 s[16:17]
36+
%fptr = load ptr, ptr addrspace(4) @constant, align 8
37+
tail call void %fptr()
38+
ret void
39+
}
40+
41+
define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 {
42+
; CHECK-LABEL: tail_call_uniform_vgpr_value_convergence_tokens:
43+
; CHECK: ; %bb.0:
44+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
46+
; CHECK-NEXT: ds_read_b64 v[0:1], v0
47+
; CHECK-NEXT: ; meta instruction
48+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
49+
; CHECK-NEXT: v_readfirstlane_b32 s19, v1
50+
; CHECK-NEXT: v_readfirstlane_b32 s18, v0
51+
; CHECK-NEXT: s_setpc_b64 s[18:19]
52+
%t = call token @llvm.experimental.convergence.entry()
53+
%fptr = load ptr, ptr addrspace(3) null, align 8
54+
tail call void %fptr() #0 [ "convergencectrl"(token %t) ]
55+
ret void
56+
}
57+
58+
attributes #0 = { convergent }

0 commit comments

Comments
 (0)