Skip to content

Commit 222b99d

Browse files
[AMDGPU][True16][CodeGen] update waitcnt for true16 (#128927)
Update the waitcnt pass to check hi16 and lo16 in true16 mode.

Co-authored-by: Jay Foad <[email protected]>
1 parent 7129205 commit 222b99d

File tree

2 files changed

+19
-14
lines changed

2 files changed

+19
-14
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -130,10 +130,10 @@ enum WaitEventType {
130130
// We reserve a fixed number of VGPR slots in the scoring tables for
131131
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
132132
enum RegisterMapping {
133-
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
134-
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
135-
SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
136-
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
133+
SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
134+
AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
135+
SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
136+
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
137137
// Artificial register slots to track LDS writes into specific LDS locations
138138
// if a location is known. When slots are exhausted or location is
139139
// unknown use the first slot. The first slot is also always updated in
@@ -748,27 +748,32 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
748748

749749
RegInterval Result;
750750

751-
unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
752-
AMDGPU::HWEncoding::REG_IDX_MASK;
751+
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
752+
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
753+
assert(isUInt<8>(RegIdx));
753754

755+
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
756+
unsigned Size = TRI->getRegSizeInBits(*RC);
757+
758+
// AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
754759
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
755-
assert(Reg <= SQ_MAX_PGM_VGPRS);
760+
unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
761+
assert(Reg < AGPR_OFFSET);
756762
Result.first = Reg;
757763
if (TRI->isAGPR(*MRI, Op.getReg()))
758764
Result.first += AGPR_OFFSET;
759765
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
760-
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && Reg < SQ_MAX_PGM_SGPRS) {
766+
assert(Size % 16 == 0);
767+
Result.second = Result.first + (Size / 16);
768+
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
761769
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
762770
// sources like SRC_PRIVATE_BASE.
763-
Result.first = Reg + NUM_ALL_VGPRS;
771+
Result.first = RegIdx + NUM_ALL_VGPRS;
772+
Result.second = Result.first + divideCeil(Size, 32);
764773
} else {
765774
return {-1, -1};
766775
}
767776

768-
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
769-
unsigned Size = TRI->getRegSizeInBits(*RC);
770-
Result.second = Result.first + ((Size + 16) / 32);
771-
772777
return Result;
773778
}
774779

llvm/test/CodeGen/AMDGPU/spillv16.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() {
6161
; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
6262
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
6363
; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
64-
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
6564
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
65+
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
6666
; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
6767
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
6868
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc

0 commit comments

Comments
 (0)