-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Inplace FI elimination during PEI for scalar copy instruction #99556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4c43e74
7b80c8b
126bd30
eb1e303
aa34010
ab19c1e
ce41a13
4254f3b
7bcea66
23c7344
b5a09ea
2f16d7b
adf2f18
4c97dde
bb9f139
548f498
a4d8201
f7f8b16
9ae1cd2
0b696d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, | |
? &AMDGPU::SReg_32RegClass | ||
: &AMDGPU::VGPR_32RegClass; | ||
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || | ||
MI->getOpcode() == AMDGPU::V_MOV_B32_e64; | ||
MI->getOpcode() == AMDGPU::V_MOV_B32_e64 || | ||
MI->getOpcode() == AMDGPU::S_MOV_B32; | ||
Register ResultReg = | ||
IsCopy ? MI->getOperand(0).getReg() | ||
: RS->scavengeRegisterBackwards(*RC, MI, false, 0); | ||
|
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, | |
if (Offset == 0) { | ||
unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 | ||
: AMDGPU::V_LSHRREV_B32_e64; | ||
auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg); | ||
Register TmpResultReg = ResultReg; | ||
if (IsSALU && LiveSCC) { | ||
TmpResultReg = RS->scavengeRegisterBackwards( | ||
AMDGPU::VGPR_32RegClass, MI, false, 0); | ||
} | ||
|
||
auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg); | ||
if (OpCode == AMDGPU::V_LSHRREV_B32_e64) | ||
// For V_LSHRREV, the operands are reversed (the shift count goes | ||
// first). | ||
|
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, | |
if (IsSALU && !LiveSCC) | ||
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. | ||
if (IsSALU && LiveSCC) { | ||
Register NewDest = RS->scavengeRegisterBackwards( | ||
AMDGPU::SReg_32RegClass, Shift, false, 0); | ||
Register NewDest = | ||
IsCopy ? ResultReg | ||
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass, | ||
Shift, false, 0); | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), | ||
NewDest) | ||
.addReg(ResultReg); | ||
.addReg(TmpResultReg); | ||
ResultReg = NewDest; | ||
} | ||
} else { | ||
|
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, | |
|
||
// We may have 1 free scratch SGPR even though a carry out is | ||
// unavailable. Only one additional mov is needed. | ||
Register TmpScaledReg = RS->scavengeRegisterBackwards( | ||
AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); | ||
Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; | ||
Register TmpScaledReg = IsCopy && IsSALU | ||
? ResultReg | ||
: RS->scavengeRegisterBackwards( | ||
AMDGPU::SReg_32_XM0RegClass, MI, | ||
false, 0, /*AllowSpill=*/false); | ||
Register ScaledReg = | ||
TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; | ||
Register TmpResultReg = ScaledReg; | ||
|
||
if (!LiveSCC) { | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg) | ||
.addReg(FrameReg) | ||
.addImm(ST.getWavefrontSizeLog2()); | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg) | ||
.addReg(TmpResultReg, RegState::Kill) | ||
.addImm(Offset); | ||
} else { | ||
TmpResultReg = RS->scavengeRegisterBackwards( | ||
AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); | ||
|
||
MachineInstrBuilder Add; | ||
if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) { | ||
BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), | ||
TmpResultReg) | ||
.addImm(ST.getWavefrontSizeLog2()) | ||
.addReg(FrameReg); | ||
if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) { | ||
BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), | ||
ResultReg) | ||
.addImm(Offset); | ||
Add.addReg(ResultReg, RegState::Kill) | ||
.addReg(TmpResultReg, RegState::Kill) | ||
.addImm(0); | ||
} else | ||
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill); | ||
} else { | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When you do this reassociation, you need to multiply the offset by the wavesize. You're doing the mul + add in terms of waves There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This needs comments about why this is doing this reassociation vs. above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the offset is an inline immediate, you can just fold it directly into the mad24 operand as well There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should address this in a follow up There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also should special case folding frame indexes into S_ADD_I32/S_OR_B32 |
||
TmpResultReg) | ||
.addImm(Offset); | ||
assert(Offset > 0 && | ||
isUInt<24>(2 * ST.getMaxWaveScratchSize()) && | ||
"offset is unsafe for v_mad_u32_u24"); | ||
// We start with a frame pointer with a wave space value, and an | ||
// offset in lane-space. We are materializing a lane space | ||
// value. We can either do a right shift of the frame pointer to | ||
// get to lane space, or a left shift of the offset to get to | ||
// wavespace. We can right shift after the computation to get | ||
// back to the desired per-lane value. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mention we are using the mad_u32_u24 primarily as an add with no carry out clobber |
||
// We are using the mad_u32_u24 primarily as an add with no | ||
// carry out clobber. | ||
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64), | ||
TmpResultReg) | ||
.addReg(TmpResultReg, RegState::Kill) | ||
.addImm(ST.getWavefrontSize()) | ||
.addReg(FrameReg) | ||
.addImm(0); | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), | ||
TmpResultReg) | ||
.addImm(ST.getWavefrontSizeLog2()) | ||
.addReg(FrameReg); | ||
} | ||
|
||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) | ||
.addReg(FrameReg) | ||
.addImm(ST.getWavefrontSizeLog2()); | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) | ||
.addReg(ScaledReg, RegState::Kill) | ||
.addImm(Offset); | ||
Register NewDest = IsCopy ? ResultReg | ||
: RS->scavengeRegisterBackwards( | ||
AMDGPU::SReg_32RegClass, *Add, | ||
false, 0, /*AllowSpill=*/true); | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), | ||
NewDest) | ||
.addReg(TmpResultReg); | ||
Comment on lines +2601 to +2603
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole sequence is repeated above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, that is when offset is 0. |
||
ResultReg = NewDest; | ||
} | ||
if (!IsSALU) | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) | ||
.addReg(ScaledReg, RegState::Kill); | ||
.addReg(TmpResultReg, RegState::Kill); | ||
else | ||
ResultReg = ScaledReg; | ||
|
||
ResultReg = TmpResultReg; | ||
// If there were truly no free SGPRs, we need to undo everything. | ||
if (!TmpScaledReg.isValid()) { | ||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This exact sequence is repeated above?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see these two instructions above; for offset 0 we have the shift but not the add.