-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][CodeGen] update waitcnt for true16 #128927
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
8f8c844
to
1ae1ff6
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: update waitcnt pass to check hi16 and lo16 in true16 mode. Full diff: https://github.com/llvm/llvm-project/pull/128927.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee263f58bcaf2..d86bf27aa96cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -137,10 +137,10 @@ enum WaitEventType {
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
- AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
- SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
- NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
+ SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
+ AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
+ SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+ NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
// Artificial register slots to track LDS writes into specific LDS locations
// if a location is known. When slots are exhausted or location is
// unknown use the first slot. The first slot is also always updated in
@@ -165,6 +165,17 @@ enum VmemType {
NUM_VMEM_TYPES
};
+static unsigned getRegPoint(MCRegister Reg, const SIRegisterInfo &TRI) {
+ // Order register interval points so that intervals of 32-bit VGPRs
+ // include intervals of their 16-bit halves.
+ MCRegister MCReg = AMDGPU::getMCReg(Reg, TRI.getSubtarget());
+ unsigned RegIdx = TRI.getHWRegIndex(MCReg);
+ bool IsHi = AMDGPU::isHi16Reg(MCReg, TRI);
+ bool IsVector = TRI.isVectorRegister(MCReg);
+ assert(isUInt<8>(RegIdx));
+ return (IsVector ? 0x200 : 0) | (RegIdx << 1) | (IsHi ? 1 : 0);
+}
+
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
@@ -757,30 +768,31 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
- AMDGPU::HWEncoding::REG_IDX_MASK;
+ unsigned Reg = getRegPoint(Op.getReg(), *TRI);
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ // VGPRs are tracked every 16 bits, SGPRs by 32 bits
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
Result.first = Reg - Encoding.VGPR0;
if (TRI->isAGPR(*MRI, Op.getReg()))
Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+ assert(Size % 16 == 0);
+ Result.second = Result.first + (Size / 16);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
- Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS * 2);
+ Result.first = ((Reg - Encoding.SGPR0) >> 1) + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ Result.second = Result.first + divideCeil(Size, 32);
}
// TODO: Handle TTMP
// else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
else
return {-1, -1};
- const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + ((Size + 16) / 32);
-
return Result;
}
@@ -2452,16 +2464,14 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumVGPRsMax + AGPR_OFFSET <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding Encoding = {};
- Encoding.VGPR0 =
- TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
- Encoding.SGPR0 =
- TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
+ Encoding.VGPR0 = getRegPoint(AMDGPU::VGPR0, *TRI);
+ Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax * 2 - 1;
+ Encoding.SGPR0 = getRegPoint(AMDGPU::SGPR0, *TRI);
+ Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax * 2 - 1;
BlockInfos.clear();
bool Modified = false;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 6f99c7b4c4962..e41628dff3f5a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3611,6 +3611,12 @@ SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
return getSubRegisterClass(SrcRC, MO.getSubReg());
}
+bool SIRegisterInfo::isVGPR(MCRegister Reg) const {
+ const TargetRegisterClass *RC = getPhysRegBaseClass(Reg);
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && isVGPRClass(RC);
+}
+
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
Register Reg) const {
const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
@@ -3618,6 +3624,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
return RC && isVGPRClass(RC);
}
+bool SIRegisterInfo::isAGPR(MCRegister Reg) const {
+ const TargetRegisterClass *RC = getPhysRegBaseClass(Reg);
+
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && isAGPRClass(RC);
+}
+
bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
Register Reg) const {
const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..1dd9e94e1f195 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -64,6 +64,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
bool hasReg() { return VGPR != 0; }
};
+ const GCNSubtarget &getSubtarget() const { return ST; }
+
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
@@ -295,8 +297,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
getRegClassForOperandReg(const MachineRegisterInfo &MRI,
const MachineOperand &MO) const;
+ bool isVGPR(MCRegister Reg) const;
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isAGPR(MCRegister Reg) const;
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isVectorRegister(MCRegister Reg) const {
+ return isVGPR(Reg) || isAGPR(Reg);
+ }
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const {
return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
}
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 0e45df223465d..3d21860e2af40 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() {
; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
|
1ae1ff6
to
6c761e7
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
6c761e7
to
47baf50
Compare
ping! |
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. | ||
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. | ||
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. | ||
SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any consequence to widening this on targets without true16?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The tracking is not differentiated by the presence or absence of True16 instructions on the subtarget. So it would 2x the size of unsigned VgprScores, which also causes 2x the iterations in the for loop in determineWait and setScoreByInterval on all subtargets.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we avoid that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand correctly, this is mainly about compile speed right?
It seems determineWait and setScoreByInterval are doing for loops and checking for boundaries. I assume we mostly just update the waitcnt of a few registers at a time, right? If that's true, we could benefit by maintaining a sorted list of the scores so that we can turn the for loop into a binary search. The gain in search time should outweigh the extra cost of updating values.
Otherwise we might go with Ivan's suggestions to only increase this number for target with true16 enabled. Please let me know your comments on this. Thanks!
dafa28b
to
0573ac9
Compare
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); | ||
unsigned RegIdx = TRI->getHWRegIndex(MCReg); | ||
assert(isUInt<8>(RegIdx)); | ||
unsigned Reg = (RegIdx << 1) | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like putting this under useRealTrue16Insts()
could save the time spent iterating through the larger ranges where 16-bit registers are not employed. Could be a separate patch?
0573ac9
to
3400d81
Compare
3400d81
to
3f3c575
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with some final nits
Co-authored-by: Jay Foad <[email protected]>
update waitcnt pass to check hi16 and lo16 in true16 mode