Skip to content

Commit b1d4246

Browse files
authored
[AMDGPU] Fix hidden kernarg preload count inconsistency (#116759)
It is possible that the number of hidden arguments that are selected to be preloaded in AMDGPULowerKernel arguments and isel can differ. This isn't an issue with explicit arguments since isel can lower the argument correctly either way, but with hidden arguments we may have alignment issues if we try to load these hidden arguments that were added to the kernel signature. The reason for the mismatch is that isel reserves an extra synthetic user SGPR for module LDS. Instead of teaching lowerFormalArguments how to handle these properly it makes more sense and is less expensive to fix the mismatch and assert if we ever run into this issue again. We should never be trying to lower these in the normal way. In a future change we probably want to revise how we track "synthetic" user SGPRs and unify the handling in GCNUserSGPRUsageInfo. Sometimes synthetic SGPRSs are considered user SGPRs and sometimes they are not. Until then this patch resolves the inconsistency, fixes the bug, and is otherwise a NFC.
1 parent 1fbbf4c commit b1d4246

File tree

5 files changed

+121
-12
lines changed

5 files changed

+121
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,12 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
520520

521521
// TODO: Align down to dword alignment and extract bits for extending loads.
522522
for (auto &Arg : F.args()) {
523+
// TODO: Add support for kernarg preload.
524+
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
525+
LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
526+
return false;
527+
}
528+
523529
const bool IsByRef = Arg.hasByRefAttr();
524530
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
525531
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,20 +144,20 @@ class PreloadKernelArgInfo {
144144
// Returns the maximum number of user SGPRs that we have available to preload
145145
// arguments.
146146
void setInitialFreeUserSGPRsCount() {
147-
const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
148147
GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
149-
150-
NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
148+
NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
151149
}
152150

153151
bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
154152
uint64_t LastExplicitArgOffset) {
155153
// Check if this argument may be loaded into the same register as the
156154
// previous argument.
157-
if (!isAligned(Align(4), ArgOffset) && AllocSize < 4)
155+
if (ArgOffset - LastExplicitArgOffset < 4 &&
156+
!isAligned(Align(4), ArgOffset))
158157
return true;
159158

160159
// Pad SGPRs for kernarg alignment.
160+
ArgOffset = alignDown(ArgOffset, 4);
161161
unsigned Padding = ArgOffset - LastExplicitArgOffset;
162162
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
163163
unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
@@ -170,6 +170,7 @@ class PreloadKernelArgInfo {
170170

171171
// Try to allocate SGPRs to preload implicit kernel arguments.
172172
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
173+
uint64_t LastExplicitArgOffset,
173174
IRBuilder<> &Builder) {
174175
Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
175176
F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
@@ -215,7 +216,6 @@ class PreloadKernelArgInfo {
215216
// argument can actually be preloaded.
216217
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
217218

218-
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
219219
// If we fail to preload any implicit argument we know we don't have SGPRs
220220
// to preload any subsequent ones with larger offsets. Find the first
221221
// argument that we cannot preload.
@@ -229,7 +229,8 @@ class PreloadKernelArgInfo {
229229
LastExplicitArgOffset))
230230
return true;
231231

232-
LastExplicitArgOffset = LoadOffset + LoadSize;
232+
LastExplicitArgOffset =
233+
ImplicitArgsBaseOffset + LoadOffset + LoadSize;
233234
return false;
234235
});
235236

@@ -486,7 +487,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
486487
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
487488
BaseOffset;
488489
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
489-
Builder);
490+
ExplicitArgOffset, Builder);
490491
}
491492

492493
return true;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2546,8 +2546,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25462546
unsigned Padding = ArgOffset - LastExplicitArgOffset;
25472547
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
25482548
// Check for free user SGPRs for preloading.
2549-
if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2550-
SGPRInfo.getNumFreeUserSGPRs()) {
2549+
if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
25512550
InPreloadSequence = false;
25522551
break;
25532552
}
@@ -3025,6 +3024,20 @@ SDValue SITargetLowering::LowerFormalArguments(
30253024
NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
30263025
}
30273026
} else {
3027+
// Hidden arguments that are in the kernel signature must be preloaded
3028+
// to user SGPRs. Print a diagnostic error if a hidden argument is in
3029+
// the argument list and is not preloaded.
3030+
if (Arg.isOrigArg()) {
3031+
Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3032+
if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3033+
DiagnosticInfoUnsupported NonPreloadHiddenArg(
3034+
*OrigArg->getParent(),
3035+
"hidden argument in kernel signature was not preloaded",
3036+
DL.getDebugLoc());
3037+
DAG.getContext()->diagnose(NonPreloadHiddenArg);
3038+
}
3039+
}
3040+
30283041
NewArg =
30293042
lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
30303043
Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; RUN: not llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
2+
; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
3+
; RUN: not llc -global-isel=1 -global-isel-abort=2 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
4+
; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
5+
6+
define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
7+
; GISEL: warning: Instruction selection used fallback path for no_free_sgprs_block_count_x_no_preload_diag
8+
; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): hidden argument in kernel signature was not preloaded
9+
store i32 %_hidden_block_count_x, ptr addrspace(1) %out
10+
ret void
11+
}
12+
13+
define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
14+
; GISEL: warning: Instruction selection used fallback path for preloadremainder_z_no_preload_diag
15+
; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): hidden argument in kernel signature was not preloaded
16+
%conv = zext i16 %_hidden_remainder_z to i32
17+
store i32 %conv, ptr addrspace(1) %out
18+
ret void
19+
}
20+
21+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -599,10 +599,8 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
599599
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
600600
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
601601
; GFX940-NEXT: ; %bb.0:
602-
; GFX940-NEXT: s_load_dword s0, s[4:5], 0x1c
602+
; GFX940-NEXT: s_lshr_b32 s0, s15, 16
603603
; GFX940-NEXT: v_mov_b32_e32 v0, 0
604-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
605-
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
606604
; GFX940-NEXT: v_mov_b32_e32 v1, s0
607605
; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1
608606
; GFX940-NEXT: s_endpgm
@@ -626,4 +624,74 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
626624
ret void
627625
}
628626

627+
; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
628+
629+
define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
630+
; GFX940-LABEL: preload_block_max_user_sgprs:
631+
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
632+
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
633+
; GFX940-NEXT: ; %bb.0:
634+
; GFX940-NEXT: v_mov_b32_e32 v0, 0
635+
; GFX940-NEXT: v_mov_b32_e32 v1, s12
636+
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
637+
; GFX940-NEXT: s_endpgm
638+
;
639+
; GFX90a-LABEL: preload_block_max_user_sgprs:
640+
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
641+
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
642+
; GFX90a-NEXT: ; %bb.0:
643+
; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28
644+
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
645+
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
646+
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
647+
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
648+
; GFX90a-NEXT: s_endpgm
649+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
650+
%load = load i32, ptr addrspace(4) %imp_arg_ptr
651+
store i32 %load, ptr addrspace(1) %out
652+
ret void
653+
}
654+
655+
define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
656+
; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
657+
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
658+
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
659+
; GFX940-NEXT: ; %bb.0:
660+
; GFX940-NEXT: s_lshr_b32 s0, s9, 16
661+
; GFX940-NEXT: s_and_b32 s1, s8, 0xffff
662+
; GFX940-NEXT: v_mov_b32_e32 v3, 0
663+
; GFX940-NEXT: v_mov_b32_e32 v0, s6
664+
; GFX940-NEXT: v_mov_b32_e32 v1, s1
665+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
666+
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
667+
; GFX940-NEXT: s_endpgm
668+
;
669+
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
670+
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
671+
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
672+
; GFX90a-NEXT: ; %bb.0:
673+
; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
674+
; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
675+
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
676+
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
677+
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
678+
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
679+
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
680+
; GFX90a-NEXT: s_endpgm
681+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
682+
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
683+
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
684+
%gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
685+
%load0 = load i32, ptr addrspace(4) %gep0
686+
%load1 = load i16, ptr addrspace(4) %gep1
687+
%load2 = load i16, ptr addrspace(4) %gep2
688+
%conv1 = zext i16 %load1 to i32
689+
%conv2 = zext i16 %load2 to i32
690+
%ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
691+
%ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
692+
%ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
693+
store <3 x i32> %ins.2, ptr addrspace(1) %out
694+
ret void
695+
}
696+
629697
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

0 commit comments

Comments
 (0)