Skip to content

[AMDGPU] Include unused preload kernarg in KD total SGPR count #104743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
} else if (isKernel(F.getCallingConv()) &&
MFI->getNumKernargPreloadedSGPRs()) {
// Consider cases where the total number of UserSGPRs, including trailing
// allocated preload SGPRs, is greater than the number of explicitly
// referenced SGPRs.
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
ProgInfo.NumSGPR =
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}

// Adjust number of registers used to meet default/requested minimum/maximum
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5834,6 +5834,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!Seen.contains(".amdhsa_next_free_sgpr"))
return TokError(".amdhsa_next_free_sgpr directive is required");

unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);

// Consider the case where the total number of UserSGPRs, including trailing
// allocated preload SGPRs, is greater than the number of explicitly
// referenced SGPRs.
if (PreloadLength) {
MCContext &Ctx = getContext();
NextFreeSGPR = AMDGPUMCExpr::createMax(
{NextFreeSGPR, MCConstantExpr::create(UserSGPRCount, Ctx)}, Ctx);
}

const MCExpr *VGPRBlocks;
const MCExpr *SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
Expand Down Expand Up @@ -5870,8 +5881,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError("amdgpu_user_sgpr_count smaller than than implied by "
"enabled user SGPRs");

unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);

if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
return TokError("too many user SGPRs enabled");
AMDGPU::MCKernelDescriptor::bits_set(
Expand Down
73 changes: 73 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s

; OBJDUMP: Contents of section .rodata:
; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000 @...............

; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
; ASM: .amdhsa_user_sgpr_count 10
; ASM: .amdhsa_next_free_sgpr 10
; ASM: ; NumSgprs: 16
; ASM: ; NumSGPRsForWavesPerEU: 16

; Test that preloaded SGPRs that are not explicitly referenced in the kernel
; are included in the GRANULATED_WAVEFRONT_SGPR_COUNT field. This test has 6
; implicit user SGPRs enabled, 4 preloaded kernarg SGPRs, plus 6 extra SGPRs
; allocated for flat scratch, etc. The total number of allocated SGPRs encoded
; in the kernel descriptor should be 16. That's a 1 in the KD field since the
; granule size is 8 and it's NumGranules - 1. The encoding for that looks like '40'.

define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }

; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............

; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
; ASM: .amdhsa_user_sgpr_count 10
; ASM: .amdhsa_next_free_sgpr 10
; ASM: ; NumSgprs: 16
; ASM: ; NumSGPRsForWavesPerEU: 16

; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
; implicit, and 6 extra.

define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }

; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................
; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............

; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
; ASM: .amdhsa_user_sgpr_count 3
; ASM: .amdhsa_next_free_sgpr 3
; ASM: ; NumSgprs: 9
; ASM: ; NumSGPRsForWavesPerEU: 9

; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.

define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }

; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................

; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
; ASM: .amdhsa_user_sgpr_count 2
; ASM: .amdhsa_next_free_sgpr 0
; ASM: ; NumSgprs: 6
; ASM: ; NumSGPRsForWavesPerEU: 6

; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
; Encoded like '00'.

define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void }

attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
21 changes: 21 additions & 0 deletions llvm/test/MC/AMDGPU/amdhsa-kd-kernarg-preload.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s -o - | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s

.amdgcn_target "amdgcn-amd-amdhsa--gfx940"

.rodata

// Account for preload kernarg SGPRs in KD field GRANULATED_WAVEFRONT_SGPR_COUNT.

// OBJDUMP: Contents of section .rodata:
// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 ................
// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
// OBJDUMP-NOT: 0030 0000ac00 92000000 00000900 00000000 ................
// OBJDUMP-NEXT: 0030 4000ac00 92000000 00000900 00000000 @...............

// Kernel descriptor under test: it requests a kernarg preload length of 9
// while declaring .amdhsa_next_free_sgpr 0, i.e. no SGPRs are explicitly
// referenced. The objdump checks above expect the byte at offset 0x30 to be
// 0x40 rather than 0x00, so the preload SGPRs must still be counted in
// GRANULATED_WAVEFRONT_SGPR_COUNT.
.amdhsa_kernel amdhsa_kd_kernarg
.amdhsa_user_sgpr_kernarg_preload_length 9
.amdhsa_next_free_sgpr 0
.amdhsa_next_free_vgpr 0
.amdhsa_accum_offset 4
.end_amdhsa_kernel
Loading