@@ -137,10 +137,10 @@ enum WaitEventType {
137
137
// We reserve a fixed number of VGPR slots in the scoring tables for
138
138
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
139
139
enum RegisterMapping {
140
- SQ_MAX_PGM_VGPRS = 512 , // Maximum programmable VGPRs across all targets.
141
- AGPR_OFFSET = 256 , // Maximum programmable ArchVGPRs across all targets.
142
- SQ_MAX_PGM_SGPRS = 256 , // Maximum programmable SGPRs across all targets.
143
- NUM_EXTRA_VGPRS = 9 , // Reserved slots for DS.
140
+ SQ_MAX_PGM_VGPRS = 1024 , // Maximum programmable VGPRs across all targets.
141
+ AGPR_OFFSET = 512 , // Maximum programmable ArchVGPRs across all targets.
142
+ SQ_MAX_PGM_SGPRS = 256 , // Maximum programmable SGPRs across all targets.
143
+ NUM_EXTRA_VGPRS = 9 , // Reserved slots for DS.
144
144
// Artificial register slots to track LDS writes into specific LDS locations
145
145
// if a location is known. When slots are exhausted or location is
146
146
// unknown use the first slot. The first slot is also always updated in
@@ -165,6 +165,18 @@ enum VmemType {
165
165
NUM_VMEM_TYPES
166
166
};
167
167
168
+ static unsigned getRegPoint (const GCNSubtarget &ST, MCRegister Reg, // Maps Reg to an integer "point" used as a register-interval endpoint.
169
+ const SIRegisterInfo &TRI) {
170
+ // Order register interval points so that intervals of 32-bit VGPRs
171
+ // include intervals of their 16-bit halves.
172
+ MCRegister MCReg = AMDGPU::getMCReg (Reg, ST); // Register as resolved by AMDGPU::getMCReg for subtarget ST.
173
+ unsigned RegIdx = TRI.getHWRegIndex (MCReg); // Index reported by TRI.getHWRegIndex; asserted below to fit in 8 bits.
174
+ bool IsHi = AMDGPU::isHi16Reg (MCReg, TRI); // Result of AMDGPU::isHi16Reg; becomes bit 0 of the returned point.
175
+ bool IsVector = TRI.isVectorRegister (MCReg); // Result of TRI.isVectorRegister; selects the 0x200 flag below.
176
+ assert (isUInt<8 >(RegIdx)); // Keeps (RegIdx << 1) | IsHi <= 0x1FF so it cannot collide with the 0x200 flag.
177
+ return (IsVector ? 0x200 : 0 ) | (RegIdx << 1 ) | (IsHi ? 1 : 0 ); // Layout: bit 9 = vector flag, bits 8..1 = RegIdx, bit 0 = IsHi.
178
+ }
179
+
168
180
// Maps values of InstCounterType to the instruction that waits on that
169
181
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
170
182
// returns true.
@@ -757,30 +769,31 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
757
769
758
770
RegInterval Result;
759
771
760
- unsigned Reg = TRI->getEncodingValue (AMDGPU::getMCReg (Op.getReg (), *ST)) &
761
- AMDGPU::HWEncoding::REG_IDX_MASK;
772
+ unsigned Reg = getRegPoint (*ST, Op.getReg (), *TRI);
773
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass (Op.getReg ());
774
+ unsigned Size = TRI->getRegSizeInBits (*RC);
762
775
776
+ // VGPRs are tracked every 16 bits, SGPRs by 32 bits
763
777
if (TRI->isVectorRegister (*MRI, Op.getReg ())) {
764
778
assert (Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL );
765
779
Result.first = Reg - Encoding.VGPR0 ;
766
780
if (TRI->isAGPR (*MRI, Op.getReg ()))
767
781
Result.first += AGPR_OFFSET;
768
782
assert (Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
783
+ assert (Size % 16 == 0 );
784
+ Result.second = Result.first + (Size / 16 );
769
785
} else if (TRI->isSGPRReg (*MRI, Op.getReg ())) {
770
- assert (Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
771
- Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
786
+ assert (Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS * 2 );
787
+ Result.first = (( Reg - Encoding.SGPR0 ) >> 1 ) + NUM_ALL_VGPRS;
772
788
assert (Result.first >= NUM_ALL_VGPRS &&
773
789
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
790
+ Result.second = Result.first + divideCeil (Size, 32 );
774
791
}
775
792
// TODO: Handle TTMP
776
793
// else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
777
794
else
778
795
return {-1 , -1 };
779
796
780
- const TargetRegisterClass *RC = TRI->getPhysRegBaseClass (Op.getReg ());
781
- unsigned Size = TRI->getRegSizeInBits (*RC);
782
- Result.second = Result.first + ((Size + 16 ) / 32 );
783
-
784
797
return Result;
785
798
}
786
799
@@ -2452,16 +2465,14 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2452
2465
2453
2466
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs ();
2454
2467
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs ();
2455
- assert (NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2468
+ assert (NumVGPRsMax + AGPR_OFFSET <= SQ_MAX_PGM_VGPRS);
2456
2469
assert (NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2457
2470
2458
2471
RegisterEncoding Encoding = {};
2459
- Encoding.VGPR0 =
2460
- TRI->getEncodingValue (AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2461
- Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1 ;
2462
- Encoding.SGPR0 =
2463
- TRI->getEncodingValue (AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2464
- Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1 ;
2472
+ Encoding.VGPR0 = getRegPoint (*ST, AMDGPU::VGPR0, *TRI);
2473
+ Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax * 2 - 1 ;
2474
+ Encoding.SGPR0 = getRegPoint (*ST, AMDGPU::SGPR0, *TRI);
2475
+ Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax * 2 - 1 ;
2465
2476
2466
2477
BlockInfos.clear ();
2467
2478
bool Modified = false ;
0 commit comments