Skip to content

Commit fc83f1d

Browse files
committed
[AMDGPU] Add backend support for new PAL ELF Metadata 3.0
PAL Metadata 3.0 introduces an explicit structure in metadata for the programmable registers written out by the compiler backend, rather than using opaque registers, which can change between different architectures and require encoding the bitfield information in the backend, which may itself change between versions. This is the initial minimal implementation that enables the use of PAL Metadata 3.0. The change itself should be NFC for non-PAL, although the way the RSRC2 register is handled has been changed slightly. The test is fairly minimal, but checks that the metadata format looks as expected and verifies a couple of special cases such as tgid_[xyz]_en handling and PsInputAddr/Ena, which also change to explicit fields. Differential Revision: https://reviews.llvm.org/D147143
1 parent e4251fc commit fc83f1d

File tree

6 files changed

+476
-58
lines changed

6 files changed

+476
-58
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 111 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
441441

442442
assert(isUInt<32>(PI.ScratchSize));
443443
assert(isUInt<32>(PI.getComputePGMRSrc1()));
444-
assert(isUInt<32>(PI.ComputePGMRSrc2));
444+
assert(isUInt<32>(PI.getComputePGMRSrc2()));
445445

446446
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
447447
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
@@ -450,7 +450,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
450450
KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
451451

452452
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
453-
KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
453+
KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
454454
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
455455

456456
assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -579,28 +579,27 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
579579
OutStreamer->emitRawComment(
580580
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
581581

582-
OutStreamer->emitRawComment(
583-
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
584-
Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
585-
OutStreamer->emitRawComment(
586-
" COMPUTE_PGM_RSRC2:USER_SGPR: " +
587-
Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
588-
OutStreamer->emitRawComment(
589-
" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
590-
Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
591-
OutStreamer->emitRawComment(
592-
" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
593-
Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
594-
OutStreamer->emitRawComment(
595-
" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
596-
Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
597-
OutStreamer->emitRawComment(
598-
" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
599-
Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
600-
OutStreamer->emitRawComment(
601-
" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
602-
Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
603-
false);
582+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
583+
Twine(CurrentProgramInfo.ScratchEnable),
584+
false);
585+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
586+
Twine(CurrentProgramInfo.UserSGPR),
587+
false);
588+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
589+
Twine(CurrentProgramInfo.TrapHandlerEnable),
590+
false);
591+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
592+
Twine(CurrentProgramInfo.TGIdXEnable),
593+
false);
594+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
595+
Twine(CurrentProgramInfo.TGIdYEnable),
596+
false);
597+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
598+
Twine(CurrentProgramInfo.TGIdZEnable),
599+
false);
600+
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
601+
Twine(CurrentProgramInfo.TIdIGCompCount),
602+
false);
604603

605604
assert(STM.hasGFX90AInsts() ||
606605
CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -922,22 +921,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
922921
// anything to disable it if we know the stack isn't used here. We may still
923922
// have emitted code reading it to initialize scratch, but if that's unused
924923
// reading garbage should be OK.
925-
const bool EnablePrivateSegment =
924+
ProgInfo.ScratchEnable =
926925
ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
927-
ProgInfo.ComputePGMRSrc2 =
928-
S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
929-
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
930-
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
931-
S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
932-
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
933-
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
934-
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
935-
S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
936-
S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
937-
S_00B84C_EXCP_EN_MSB(0) |
938-
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
939-
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
940-
S_00B84C_EXCP_EN(0);
926+
ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
927+
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
928+
ProgInfo.TrapHandlerEnable =
929+
STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
930+
ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
931+
ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
932+
ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
933+
ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
934+
ProgInfo.TIdIGCompCount = TIDIGCompCnt;
935+
ProgInfo.EXCPEnMSB = 0;
936+
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
937+
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
938+
ProgInfo.EXCPEnable = 0;
941939

942940
if (STM.hasGFX90AInsts()) {
943941
AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
@@ -978,7 +976,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
978976
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
979977

980978
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
981-
OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
979+
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
982980

983981
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
984982
OutStreamer->emitInt32(
@@ -1038,25 +1036,87 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
10381036
}
10391037

10401038
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1041-
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1042-
if (AMDGPU::isCompute(CC)) {
1043-
MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
1039+
if (MD->getPALMajorVersion() < 3) {
1040+
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1041+
if (AMDGPU::isCompute(CC)) {
1042+
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1043+
} else {
1044+
if (CurrentProgramInfo.ScratchBlocks > 0)
1045+
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1046+
}
10441047
} else {
1045-
if (CurrentProgramInfo.ScratchBlocks > 0)
1046-
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1048+
// Priority?
1049+
MD->setHwStage(CC, ".float_mode", CurrentProgramInfo.FloatMode);
1050+
// Priv?
1051+
// DX10Clamp?
1052+
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1053+
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1054+
MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1055+
MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1056+
1057+
if (AMDGPU::isCompute(CC)) {
1058+
MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1059+
MD->setHwStage(CC, ".trap_present",
1060+
(bool)CurrentProgramInfo.TrapHandlerEnable);
1061+
1062+
// Compute registers
1063+
// If the front-end has set tgid_x/y/z_en - assert that the
1064+
// CurrentProgramInfo is consistent (usually set with function attributes
1065+
// amdgpu-no-workgroup-id-x etc.).
1066+
assert(MD->checkComputeRegisters(".tgid_x_en",
1067+
(bool)CurrentProgramInfo.TGIdXEnable));
1068+
assert(MD->checkComputeRegisters(".tgid_y_en",
1069+
(bool)CurrentProgramInfo.TGIdYEnable));
1070+
assert(MD->checkComputeRegisters(".tgid_z_en",
1071+
(bool)CurrentProgramInfo.TGIdZEnable));
1072+
1073+
// EXCPEnMSB?
1074+
const unsigned LdsDwGranularity = 128;
1075+
MD->setHwStage(CC, ".lds_size",
1076+
(unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1077+
sizeof(uint32_t)));
1078+
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1079+
} else {
1080+
MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1081+
}
10471082
}
1083+
10481084
// ScratchSize is in bytes, 16 aligned.
10491085
MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
10501086
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
10511087
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
10521088
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
10531089
: CurrentProgramInfo.LDSBlocks;
1054-
MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1055-
MD->setSpiPsInputEna(MFI->getPSInputEnable());
1056-
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1090+
if (MD->getPALMajorVersion() < 3) {
1091+
MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1092+
MD->setSpiPsInputEna(MFI->getPSInputEnable());
1093+
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1094+
} else {
1095+
// Graphics registers
1096+
MD->setGraphicsRegisters(".ps_extra_lds_size", ExtraLDSSize);
1097+
// Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1098+
static StringLiteral const PsInputFields[] = {
1099+
".persp_sample_ena", ".persp_center_ena",
1100+
".persp_centroid_ena", ".persp_pull_model_ena",
1101+
".linear_sample_ena", ".linear_center_ena",
1102+
".linear_centroid_ena", ".line_stipple_tex_ena",
1103+
".pos_x_float_ena", ".pos_y_float_ena",
1104+
".pos_z_float_ena", ".pos_w_float_ena",
1105+
".front_face_ena", ".ancillary_ena",
1106+
".sample_coverage_ena", ".pos_fixed_pt_ena"};
1107+
unsigned PSInputEna = MFI->getPSInputEnable();
1108+
unsigned PSInputAddr = MFI->getPSInputAddr();
1109+
for (auto [Idx, Field] : enumerate(PsInputFields)) {
1110+
MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1111+
(bool)((PSInputEna >> Idx) & 1));
1112+
MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1113+
(bool)((PSInputAddr >> Idx) & 1));
1114+
}
1115+
}
10571116
}
10581117

1059-
if (STM.isWave32())
1118+
// For version 3 and above the wave front size is already set in the metadata
1119+
if (MD->getPALMajorVersion() < 3 && STM.isWave32())
10601120
MD->setWave32(MF.getFunction().getCallingConv());
10611121
}
10621122

@@ -1068,7 +1128,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
10681128
// Set compute registers
10691129
MD->setRsrc1(CallingConv::AMDGPU_CS,
10701130
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1071-
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
1131+
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
10721132

10731133
// Set optional info
10741134
MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
@@ -1104,7 +1164,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
11041164

11051165
Out.compute_pgm_resource_registers =
11061166
CurrentProgramInfo.getComputePGMRSrc1() |
1107-
(CurrentProgramInfo.ComputePGMRSrc2 << 32);
1167+
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
11081168
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
11091169

11101170
if (CurrentProgramInfo.DynamicCallStack)

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,23 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const {
5454
}
5555
return Reg;
5656
}
57+
58+
/// Builds the COMPUTE_PGM_RSRC2 register value by OR-ing together the
/// S_00B84C_* encodings of the individual fields held in this struct
/// (scratch enable, user SGPR count, trap handler enable, TGID X/Y/Z enables,
/// TG size enable, TIDIG comp count, EXCP_EN_MSB, LDS size and EXCP_EN).
uint64_t SIProgramInfo::getComputePGMRSrc2() const {
59+
uint64_t Reg =
60+
S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
61+
S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
62+
S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
63+
S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
64+
S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
65+
S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
66+
S_00B84C_EXCP_EN(EXCPEnable);
67+
68+
return Reg;
69+
}
70+
71+
/// Returns the PGM_RSRC2 value for calling convention \p CC: the value of
/// getComputePGMRSrc2() when AMDGPU::isCompute(CC) holds, and 0 otherwise.
uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
72+
if (AMDGPU::isCompute(CC))
73+
return getComputePGMRSrc2();
74+
75+
// Non-compute calling conventions get no bits from this helper.
return 0;
76+
}

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,23 @@ struct SIProgramInfo {
3636
uint32_t MemOrdered = 0; // GFX10+
3737
uint64_t ScratchSize = 0;
3838

39-
// Fields set in PGM_RSRC2 pm4 packet.
39+
// State used to calculate fields set in PGM_RSRC2 pm4 packet.
4040
uint32_t LDSBlocks = 0;
4141
uint32_t ScratchBlocks = 0;
4242

43-
uint64_t ComputePGMRSrc2 = 0;
43+
// Fields set in PGM_RSRC2 pm4 packet
44+
uint32_t ScratchEnable = 0;
45+
uint32_t UserSGPR = 0;
46+
uint32_t TrapHandlerEnable = 0;
47+
uint32_t TGIdXEnable = 0;
48+
uint32_t TGIdYEnable = 0;
49+
uint32_t TGIdZEnable = 0;
50+
uint32_t TGSizeEnable = 0;
51+
uint32_t TIdIGCompCount = 0;
52+
uint32_t EXCPEnMSB = 0;
53+
uint32_t LdsSize = 0;
54+
uint32_t EXCPEnable = 0;
55+
4456
uint64_t ComputePGMRSrc3GFX90A = 0;
4557

4658
uint32_t NumVGPR = 0;
@@ -75,6 +87,10 @@ struct SIProgramInfo {
7587
/// Compute the value of the ComputePGMRsrc1 register.
7688
uint64_t getComputePGMRSrc1() const;
7789
uint64_t getPGMRSrc1(CallingConv::ID CC) const;
90+
91+
/// Compute the value of the ComputePGMRsrc2 register.
92+
uint64_t getComputePGMRSrc2() const;
93+
uint64_t getPGMRSrc2(CallingConv::ID CC) const;
7894
};
7995

8096
} // namespace llvm

0 commit comments

Comments
 (0)