Skip to content

Commit 954ab83

Browse files
authored
[AMDGPU] Include unused preload kernarg in KD total SGPR count (#104743)
Unlike with implicitly preloaded data UserSGPRs, firmware is unable to handle cases where SGPRs for kernel arguments contain preloaded data but are not explicitly referenced in the kernel. We need to include these preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT calculation so that we do not clobber SGPRs in adjacent waves.
1 parent 74405b9 commit 954ab83

File tree

4 files changed

+114
-2
lines changed

4 files changed

+114
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
900900

901901
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
902902
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
903+
} else if (isKernel(F.getCallingConv()) &&
904+
MFI->getNumKernargPreloadedSGPRs()) {
905+
// Consider cases where the total number of UserSGPRs with trailing
906+
// allocated preload SGPRs, is greater than the number of explicitly
907+
// referenced SGPRs.
908+
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
909+
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
910+
ProgInfo.NumSGPR =
911+
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
903912
}
904913

905914
// Adjust number of registers used to meet default/requested minimum/maximum

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5834,6 +5834,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
58345834
if (!Seen.contains(".amdhsa_next_free_sgpr"))
58355835
return TokError(".amdhsa_next_free_sgpr directive is required");
58365836

5837+
unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);
5838+
5839+
// Consider the case where the total number of UserSGPRs with trailing
5840+
// allocated preload SGPRs, is greater than the number of explicitly
5841+
// referenced SGPRs.
5842+
if (PreloadLength) {
5843+
MCContext &Ctx = getContext();
5844+
NextFreeSGPR = AMDGPUMCExpr::createMax(
5845+
{NextFreeSGPR, MCConstantExpr::create(UserSGPRCount, Ctx)}, Ctx);
5846+
}
5847+
58375848
const MCExpr *VGPRBlocks;
58385849
const MCExpr *SGPRBlocks;
58395850
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
@@ -5870,8 +5881,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
58705881
return TokError("amdgpu_user_sgpr_count smaller than than implied by "
58715882
"enabled user SGPRs");
58725883

5873-
unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);
5874-
58755884
if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
58765885
return TokError("too many user SGPRs enabled");
58775886
AMDGPU::MCKernelDescriptor::bits_set(
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s
3+
4+
; OBJDUMP: Contents of section .rodata:
5+
; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
6+
; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
7+
; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
8+
; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
9+
; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000 @...............
10+
11+
; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
12+
; ASM: .amdhsa_user_sgpr_count 10
13+
; ASM: .amdhsa_next_free_sgpr 10
14+
; ASM: ; NumSgprs: 16
15+
; ASM: ; NumSGPRsForWavesPerEU: 16
16+
17+
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
18+
; field that are not explicitly referenced in the kernel. This test has 6 implicit
19+
; user SGPRs enabled, 4 preloaded kernarg SGPRs, plus 6 extra SGPRs allocated
20+
; for flat scratch, etc. The total number of allocated SGPRs encoded in the
21+
; kernel descriptor should be 16. That's a 1 in the KD field since the granule
22+
; size is 8 and it's NumGranules - 1. The encoding for that looks like '40'.
23+
24+
define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
25+
26+
; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
27+
; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
28+
; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
29+
; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
30+
31+
; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
32+
; ASM: .amdhsa_user_sgpr_count 10
33+
; ASM: .amdhsa_next_free_sgpr 10
34+
; ASM: ; NumSgprs: 16
35+
; ASM: ; NumSGPRsForWavesPerEU: 16
36+
37+
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
38+
; implicit, and 6 extra.
39+
40+
define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
41+
42+
; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................
43+
; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................
44+
; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
45+
; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
46+
47+
; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
48+
; ASM: .amdhsa_user_sgpr_count 3
49+
; ASM: .amdhsa_next_free_sgpr 3
50+
; ASM: ; NumSgprs: 9
51+
; ASM: ; NumSGPRsForWavesPerEU: 9
52+
53+
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
54+
55+
define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
56+
57+
; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
58+
; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
59+
; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
60+
; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
61+
62+
; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
63+
; ASM: .amdhsa_user_sgpr_count 2
64+
; ASM: .amdhsa_next_free_sgpr 0
65+
; ASM: ; NumSgprs: 6
66+
; ASM: ; NumSGPRsForWavesPerEU: 6
67+
68+
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
69+
; Encoded like '00'.
70+
71+
define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void }
72+
73+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s -o - | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
2+
3+
.amdgcn_target "amdgcn-amd-amdhsa--gfx940"
4+
5+
.rodata
6+
7+
// Account for preload kernarg SGPRs in KD field GRANULATED_WAVEFRONT_SGPR_COUNT.
8+
9+
// OBJDUMP: Contents of section .rodata:
10+
// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 ................
11+
// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
12+
// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
13+
// OBJDUMP-NOT: 0030 0000ac00 92000000 00000900 00000000 ................
14+
// OBJDUMP-NEXT: 0030 4000ac00 92000000 00000900 00000000 @...............
15+
16+
.amdhsa_kernel amdhsa_kd_kernarg
17+
.amdhsa_user_sgpr_kernarg_preload_length 9
18+
.amdhsa_next_free_sgpr 0
19+
.amdhsa_next_free_vgpr 0
20+
.amdhsa_accum_offset 4
21+
.end_amdhsa_kernel

0 commit comments

Comments
 (0)