@@ -441,7 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
441
441
442
442
assert (isUInt<32 >(PI.ScratchSize ));
443
443
assert (isUInt<32 >(PI.getComputePGMRSrc1 ()));
444
- assert (isUInt<32 >(PI.ComputePGMRSrc2 ));
444
+ assert (isUInt<32 >(PI.getComputePGMRSrc2 () ));
445
445
446
446
KernelDescriptor.group_segment_fixed_size = PI.LDSSize ;
447
447
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize ;
@@ -450,7 +450,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
450
450
KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize (F, MaxKernArgAlign);
451
451
452
452
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1 ();
453
- KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2 ;
453
+ KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2 () ;
454
454
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties (MF);
455
455
456
456
assert (STM.hasGFX90AInsts () || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0 );
@@ -579,28 +579,27 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
579
579
OutStreamer->emitRawComment (
580
580
" WaveLimiterHint : " + Twine (MFI->needsWaveLimiter ()), false );
581
581
582
- OutStreamer->emitRawComment (
583
- " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
584
- Twine (G_00B84C_SCRATCH_EN (CurrentProgramInfo.ComputePGMRSrc2 )), false );
585
- OutStreamer->emitRawComment (
586
- " COMPUTE_PGM_RSRC2:USER_SGPR: " +
587
- Twine (G_00B84C_USER_SGPR (CurrentProgramInfo.ComputePGMRSrc2 )), false );
588
- OutStreamer->emitRawComment (
589
- " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
590
- Twine (G_00B84C_TRAP_HANDLER (CurrentProgramInfo.ComputePGMRSrc2 )), false );
591
- OutStreamer->emitRawComment (
592
- " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
593
- Twine (G_00B84C_TGID_X_EN (CurrentProgramInfo.ComputePGMRSrc2 )), false );
594
- OutStreamer->emitRawComment (
595
- " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
596
- Twine (G_00B84C_TGID_Y_EN (CurrentProgramInfo.ComputePGMRSrc2 )), false );
597
- OutStreamer->emitRawComment (
598
- " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
599
- Twine (G_00B84C_TGID_Z_EN (CurrentProgramInfo.ComputePGMRSrc2 )), false );
600
- OutStreamer->emitRawComment (
601
- " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
602
- Twine (G_00B84C_TIDIG_COMP_CNT (CurrentProgramInfo.ComputePGMRSrc2 )),
603
- false );
582
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
583
+ Twine (CurrentProgramInfo.ScratchEnable ),
584
+ false );
585
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:USER_SGPR: " +
586
+ Twine (CurrentProgramInfo.UserSGPR ),
587
+ false );
588
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
589
+ Twine (CurrentProgramInfo.TrapHandlerEnable ),
590
+ false );
591
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
592
+ Twine (CurrentProgramInfo.TGIdXEnable ),
593
+ false );
594
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
595
+ Twine (CurrentProgramInfo.TGIdYEnable ),
596
+ false );
597
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
598
+ Twine (CurrentProgramInfo.TGIdZEnable ),
599
+ false );
600
+ OutStreamer->emitRawComment (" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
601
+ Twine (CurrentProgramInfo.TIdIGCompCount ),
602
+ false );
604
603
605
604
assert (STM.hasGFX90AInsts () ||
606
605
CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0 );
@@ -922,22 +921,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
922
921
// anything to disable it if we know the stack isn't used here. We may still
923
922
// have emitted code reading it to initialize scratch, but if that's unused
924
923
// reading garbage should be OK.
925
- const bool EnablePrivateSegment =
924
+ ProgInfo. ScratchEnable =
926
925
ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack ;
927
- ProgInfo.ComputePGMRSrc2 =
928
- S_00B84C_SCRATCH_EN (EnablePrivateSegment) |
929
- S_00B84C_USER_SGPR (MFI->getNumUserSGPRs ()) |
930
- // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
931
- S_00B84C_TRAP_HANDLER (STM.isAmdHsaOS () ? 0 : STM.isTrapHandlerEnabled ()) |
932
- S_00B84C_TGID_X_EN (MFI->hasWorkGroupIDX ()) |
933
- S_00B84C_TGID_Y_EN (MFI->hasWorkGroupIDY ()) |
934
- S_00B84C_TGID_Z_EN (MFI->hasWorkGroupIDZ ()) |
935
- S_00B84C_TG_SIZE_EN (MFI->hasWorkGroupInfo ()) |
936
- S_00B84C_TIDIG_COMP_CNT (TIDIGCompCnt) |
937
- S_00B84C_EXCP_EN_MSB (0 ) |
938
- // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
939
- S_00B84C_LDS_SIZE (STM.isAmdHsaOS () ? 0 : ProgInfo.LDSBlocks ) |
940
- S_00B84C_EXCP_EN (0 );
926
+ ProgInfo.UserSGPR = MFI->getNumUserSGPRs ();
927
+ // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
928
+ ProgInfo.TrapHandlerEnable =
929
+ STM.isAmdHsaOS () ? 0 : STM.isTrapHandlerEnabled ();
930
+ ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX ();
931
+ ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY ();
932
+ ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ ();
933
+ ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo ();
934
+ ProgInfo.TIdIGCompCount = TIDIGCompCnt;
935
+ ProgInfo.EXCPEnMSB = 0 ;
936
+ // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
937
+ ProgInfo.LdsSize = STM.isAmdHsaOS () ? 0 : ProgInfo.LDSBlocks ;
938
+ ProgInfo.EXCPEnable = 0 ;
941
939
942
940
if (STM.hasGFX90AInsts ()) {
943
941
AMDHSA_BITS_SET (ProgInfo.ComputePGMRSrc3GFX90A ,
@@ -978,7 +976,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
978
976
OutStreamer->emitInt32 (CurrentProgramInfo.getComputePGMRSrc1 ());
979
977
980
978
OutStreamer->emitInt32 (R_00B84C_COMPUTE_PGM_RSRC2);
981
- OutStreamer->emitInt32 (CurrentProgramInfo.ComputePGMRSrc2 );
979
+ OutStreamer->emitInt32 (CurrentProgramInfo.getComputePGMRSrc2 () );
982
980
983
981
OutStreamer->emitInt32 (R_00B860_COMPUTE_TMPRING_SIZE);
984
982
OutStreamer->emitInt32 (
@@ -1038,25 +1036,87 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1038
1036
}
1039
1037
1040
1038
MD->setNumUsedSgprs (CC, CurrentProgramInfo.NumSGPRsForWavesPerEU );
1041
- MD->setRsrc1 (CC, CurrentProgramInfo.getPGMRSrc1 (CC));
1042
- if (AMDGPU::isCompute (CC)) {
1043
- MD->setRsrc2 (CC, CurrentProgramInfo.ComputePGMRSrc2 );
1039
+ if (MD->getPALMajorVersion () < 3 ) {
1040
+ MD->setRsrc1 (CC, CurrentProgramInfo.getPGMRSrc1 (CC));
1041
+ if (AMDGPU::isCompute (CC)) {
1042
+ MD->setRsrc2 (CC, CurrentProgramInfo.getComputePGMRSrc2 ());
1043
+ } else {
1044
+ if (CurrentProgramInfo.ScratchBlocks > 0 )
1045
+ MD->setRsrc2 (CC, S_00B84C_SCRATCH_EN (1 ));
1046
+ }
1044
1047
} else {
1045
- if (CurrentProgramInfo.ScratchBlocks > 0 )
1046
- MD->setRsrc2 (CC, S_00B84C_SCRATCH_EN (1 ));
1048
+ // Priority?
1049
+ MD->setHwStage (CC, " .float_mode" , CurrentProgramInfo.FloatMode );
1050
+ // Priv?
1051
+ // DX10Clamp?
1052
+ MD->setHwStage (CC, " .debug_mode" , (bool )CurrentProgramInfo.DebugMode );
1053
+ MD->setHwStage (CC, " .ieee_mode" , (bool )CurrentProgramInfo.IEEEMode );
1054
+ MD->setHwStage (CC, " .wgp_mode" , (bool )CurrentProgramInfo.WgpMode );
1055
+ MD->setHwStage (CC, " .mem_ordered" , (bool )CurrentProgramInfo.MemOrdered );
1056
+
1057
+ if (AMDGPU::isCompute (CC)) {
1058
+ MD->setHwStage (CC, " .scratch_en" , (bool )CurrentProgramInfo.ScratchEnable );
1059
+ MD->setHwStage (CC, " .trap_present" ,
1060
+ (bool )CurrentProgramInfo.TrapHandlerEnable );
1061
+
1062
+ // Compute registers
1063
+ // If the front-end has set tgid_x/y/z_en - assert that the
1064
+ // CurrentProgramInfo is consistent (usually set with function attributes
1065
+ // amdgpu-no-workgroup-id-x etc.).
1066
+ assert (MD->checkComputeRegisters (" .tgid_x_en" ,
1067
+ (bool )CurrentProgramInfo.TGIdXEnable ));
1068
+ assert (MD->checkComputeRegisters (" .tgid_y_en" ,
1069
+ (bool )CurrentProgramInfo.TGIdYEnable ));
1070
+ assert (MD->checkComputeRegisters (" .tgid_z_en" ,
1071
+ (bool )CurrentProgramInfo.TGIdZEnable ));
1072
+
1073
+ // EXCPEnMSB?
1074
+ const unsigned LdsDwGranularity = 128 ;
1075
+ MD->setHwStage (CC, " .lds_size" ,
1076
+ (unsigned )(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1077
+ sizeof (uint32_t )));
1078
+ MD->setHwStage (CC, " .excp_en" , CurrentProgramInfo.EXCPEnable );
1079
+ } else {
1080
+ MD->setHwStage (CC, " .scratch_en" , (bool )CurrentProgramInfo.ScratchEnable );
1081
+ }
1047
1082
}
1083
+
1048
1084
// ScratchSize is in bytes, 16 aligned.
1049
1085
MD->setScratchSize (CC, alignTo (CurrentProgramInfo.ScratchSize , 16 ));
1050
1086
if (MF.getFunction ().getCallingConv () == CallingConv::AMDGPU_PS) {
1051
1087
unsigned ExtraLDSSize = STM.getGeneration () >= AMDGPUSubtarget::GFX11
1052
1088
? divideCeil (CurrentProgramInfo.LDSBlocks , 2 )
1053
1089
: CurrentProgramInfo.LDSBlocks ;
1054
- MD->setRsrc2 (CC, S_00B02C_EXTRA_LDS_SIZE (ExtraLDSSize));
1055
- MD->setSpiPsInputEna (MFI->getPSInputEnable ());
1056
- MD->setSpiPsInputAddr (MFI->getPSInputAddr ());
1090
+ if (MD->getPALMajorVersion () < 3 ) {
1091
+ MD->setRsrc2 (CC, S_00B02C_EXTRA_LDS_SIZE (ExtraLDSSize));
1092
+ MD->setSpiPsInputEna (MFI->getPSInputEnable ());
1093
+ MD->setSpiPsInputAddr (MFI->getPSInputAddr ());
1094
+ } else {
1095
+ // Graphics registers
1096
+ MD->setGraphicsRegisters (" .ps_extra_lds_size" , ExtraLDSSize);
1097
+ // Fill .spi_ps_input_ena/.spi_ps_input_addr from PSInputEna/PSInputAddr
1098
+ static StringLiteral const PsInputFields[] = {
1099
+ " .persp_sample_ena" , " .persp_center_ena" ,
1100
+ " .persp_centroid_ena" , " .persp_pull_model_ena" ,
1101
+ " .linear_sample_ena" , " .linear_center_ena" ,
1102
+ " .linear_centroid_ena" , " .line_stipple_tex_ena" ,
1103
+ " .pos_x_float_ena" , " .pos_y_float_ena" ,
1104
+ " .pos_z_float_ena" , " .pos_w_float_ena" ,
1105
+ " .front_face_ena" , " .ancillary_ena" ,
1106
+ " .sample_coverage_ena" , " .pos_fixed_pt_ena" };
1107
+ unsigned PSInputEna = MFI->getPSInputEnable ();
1108
+ unsigned PSInputAddr = MFI->getPSInputAddr ();
1109
+ for (auto [Idx, Field] : enumerate(PsInputFields)) {
1110
+ MD->setGraphicsRegisters (" .spi_ps_input_ena" , Field,
1111
+ (bool )((PSInputEna >> Idx) & 1 ));
1112
+ MD->setGraphicsRegisters (" .spi_ps_input_addr" , Field,
1113
+ (bool )((PSInputAddr >> Idx) & 1 ));
1114
+ }
1115
+ }
1057
1116
}
1058
1117
1059
- if (STM.isWave32 ())
1118
+ // For version 3 and above the wave front size is already set in the metadata
1119
+ if (MD->getPALMajorVersion () < 3 && STM.isWave32 ())
1060
1120
MD->setWave32 (MF.getFunction ().getCallingConv ());
1061
1121
}
1062
1122
@@ -1068,7 +1128,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1068
1128
// Set compute registers
1069
1129
MD->setRsrc1 (CallingConv::AMDGPU_CS,
1070
1130
CurrentProgramInfo.getPGMRSrc1 (CallingConv::AMDGPU_CS));
1071
- MD->setRsrc2 (CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2 );
1131
+ MD->setRsrc2 (CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2 () );
1072
1132
1073
1133
// Set optional info
1074
1134
MD->setFunctionLdsSize (MF, CurrentProgramInfo.LDSSize );
@@ -1104,7 +1164,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1104
1164
1105
1165
Out.compute_pgm_resource_registers =
1106
1166
CurrentProgramInfo.getComputePGMRSrc1 () |
1107
- (CurrentProgramInfo.ComputePGMRSrc2 << 32 );
1167
+ (CurrentProgramInfo.getComputePGMRSrc2 () << 32 );
1108
1168
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1109
1169
1110
1170
if (CurrentProgramInfo.DynamicCallStack )
0 commit comments