[AMDGPU] Inplace FI elimination during PEI for scalar copy instruction #99556


Merged · 20 commits · Jul 31, 2024

Commits
4c43e74  [AMDGPU] Inplace FI elimination during PEI for scalar copy instruction (PankajDwivedi-25, Jul 18, 2024)
7b80c8b  refactored SIRegisterInfo, reduce-frame-index (PankajDwivedi-25, Jul 19, 2024)
126bd30  refactored, checks with gfx90a (PankajDwivedi-25, Jul 19, 2024)
eb1e303  refactored, checks with gfx803 (PankajDwivedi-25, Jul 19, 2024)
aa34010  [WIP] updated check for gfx8, gfx9 (PankajDwivedi-25, Jul 19, 2024)
ab19c1e  [AMDGPU] this patch fixes all cases (PankajDwivedi-25, Jul 24, 2024)
ce41a13  [AMDGPU] renamed test file (PankajDwivedi-25, Jul 24, 2024)
4254f3b  [WIP] replaced v_mad_i32_i24 with unsigned version, refactored nested… (PankajDwivedi-25, Jul 24, 2024)
7bcea66  [WIP] renamed test file, removed unnecessary null check (PankajDwivedi-25, Jul 24, 2024)
23c7344  [WIP] renamed test file (PankajDwivedi-25, Jul 24, 2024)
b5a09ea  [WIP] rephrased comment (PankajDwivedi-25, Jul 24, 2024)
2f16d7b  [WIP] assert non-positive offset (PankajDwivedi-25, Jul 29, 2024)
adf2f18  [WIP] reused vgpr for copying offset (PankajDwivedi-25, Jul 29, 2024)
4c97dde  [WIP] mul offset with wavesize (PankajDwivedi-25, Jul 29, 2024)
bb9f139  [WIP] added target gfx10, gfx11, gfx12 (PankajDwivedi-25, Jul 29, 2024)
548f498  [WIP] updated assert, refactored test (PankajDwivedi-25, Jul 29, 2024)
a4d8201  [WIP] V_MAD_U32_U24_e64 utilises wave wavespace (PankajDwivedi-25, Jul 30, 2024)
f7f8b16  [WIP] added comment (PankajDwivedi-25, Jul 30, 2024)
9ae1cd2  [WIP] refactored (PankajDwivedi-25, Jul 30, 2024)
0b696d8  [WIP] updated opcode V_LSHR_B32_e64 with V_LSHRREV_B32_e64 (PankajDwivedi-25, Jul 31, 2024)
103 changes: 86 additions & 17 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
? &AMDGPU::SReg_32RegClass
: &AMDGPU::VGPR_32RegClass;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
MI->getOpcode() == AMDGPU::S_MOV_B32;
Register ResultReg =
IsCopy ? MI->getOperand(0).getReg()
: RS->scavengeRegisterBackwards(*RC, MI, false, 0);
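
The hunk above extends the copy check so that a scalar copy of a frame index (S_MOV_B32) is rewritten in place, reusing the copy's own destination register instead of scavenging a new one. A minimal sketch of the resulting predicate as a standalone helper; the patch tests the opcodes inline, and the helper name here is invented for illustration:

// Illustrative only: equivalent of the IsCopy check after this patch.
static bool isFrameIndexCopy(const llvm::MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::S_MOV_B32: // newly recognized: scalar copy of a frame index
    return true;
  default:
    return false;
  }
}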
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
: AMDGPU::V_LSHRREV_B32_e64;
auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
Register TmpResultReg = ResultReg;
if (IsSALU && LiveSCC) {
TmpResultReg = RS->scavengeRegisterBackwards(
AMDGPU::VGPR_32RegClass, MI, false, 0);
}

auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
// For V_LSHRREV, the operands are reversed (the shift count goes
// first).
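
That is, V_LSHRREV_B32_e64 computes dst = src1 >> src0, so the shift amount is the first source operand; this is why the immediate is added to the builder before the register.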
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
Register NewDest = RS->scavengeRegisterBackwards(
AMDGPU::SReg_32RegClass, Shift, false, 0);
Register NewDest =
IsCopy ? ResultReg
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
Shift, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(ResultReg);
.addReg(TmpResultReg);
ResultReg = NewDest;
}
} else {
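
Taken together, the Offset == 0 path now picks one of three lowerings. A compact sketch of the selection logic (illustrative names, not from the patch):

// Illustrative summary of the Offset == 0 lowering choices above.
enum class ShiftLowering { SLshr, VLshrrev, VLshrrevThenReadFirstLane };

static ShiftLowering pickLowering(bool IsSALU, bool LiveSCC) {
  if (!IsSALU)
    return ShiftLowering::VLshrrev; // VGPR result: V_LSHRREV_B32_e64
  if (!LiveSCC)
    return ShiftLowering::SLshr; // SGPR result, clobbering SCC is fine
  // SGPR result but SCC is live: shift in a scavenged VGPR, then move lane 0
  // back to the SGPR with V_READFIRSTLANE_B32, which does not write SCC.
  return ShiftLowering::VLshrrevThenReadFirstLane;
}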
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

// We may have 1 free scratch SGPR even though a carry out is
// unavailable. Only one additional mov is needed.
Register TmpScaledReg = RS->scavengeRegisterBackwards(
AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
Register TmpScaledReg = IsCopy && IsSALU
? ResultReg
: RS->scavengeRegisterBackwards(
AMDGPU::SReg_32_XM0RegClass, MI,
false, 0, /*AllowSpill=*/false);
Register ScaledReg =
TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
Register TmpResultReg = ScaledReg;

if (!LiveSCC) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
.addReg(TmpResultReg, RegState::Kill)
.addImm(Offset);
Comment on lines +2545 to +2550

Contributor: This exact sequence is repeated above?

Contributor Author: I don't see these two instructions above; for offset 0 we have a shift but not an add.
} else {
TmpResultReg = RS->scavengeRegisterBackwards(
AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);

MachineInstrBuilder Add;
if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
.addReg(FrameReg);
if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
ResultReg)
.addImm(Offset);
Add.addReg(ResultReg, RegState::Kill)
.addReg(TmpResultReg, RegState::Kill)
.addImm(0);
} else
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
TmpResultReg)
.addImm(Offset);

Contributor: When you do this reassociation, you need to multiply the offset by the wavesize. You're doing the mul + add in terms of waves.

Contributor: This needs comments about why this is doing this reassociation vs. above.

Contributor: If the offset is an inline immediate, you can just fold it directly into the mad24 operand as well.

Contributor: Should address this in a follow-up.

Contributor: We also should special-case folding frame indexes into S_ADD_I32/S_OR_B32.
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
// We start with a frame pointer with a wave space value, and an
// offset in lane-space. We are materializing a lane space
// value. We can either do a right shift of the frame pointer to
// get to lane space, or a left shift of the offset to get to
// wavespace. We can right shift after the computation to get
// back to the desired per-lane value.
Contributor: Mention we are using the mad_u32_u24 primarily as an add with no carry out clobber.

// We are using the mad_u32_u24 primarily as an add with no
// carry out clobber.
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
TmpResultReg)
.addReg(TmpResultReg, RegState::Kill)
.addImm(ST.getWavefrontSize())
.addReg(FrameReg)
.addImm(0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
.addReg(FrameReg);
}
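
The reassociation discussed in the comments above can be checked with plain arithmetic: shifting the frame register first and then adding a lane-space offset gives the same result as scaling the offset into wave space, adding, and shifting once at the end, because adding a multiple of the wavefront size never disturbs the bits consumed by the shift. A self-contained sketch with made-up values, wave64 assumed:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t WavefrontSize = 64, WavefrontSizeLog2 = 6;
  uint32_t FrameReg = 0x1234; // wave-space value (need not be wave-aligned)
  uint32_t Offset = 16;       // lane-space offset; must fit the u24 operand

  // Shift first, then add: the S_LSHR_B32 + S_ADD_I32 form.
  uint32_t A = (FrameReg >> WavefrontSizeLog2) + Offset;

  // Multiply-add in wave space, shift last: the V_MAD_U32_U24 form, where
  // the mad serves as an add with no SCC clobber.
  uint32_t B = (Offset * WavefrontSize + FrameReg) >> WavefrontSizeLog2;

  assert(A == B);
  return 0;
}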

BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
.addImm(Offset);
Register NewDest = IsCopy ? ResultReg
: RS->scavengeRegisterBackwards(
AMDGPU::SReg_32RegClass, *Add,
false, 0, /*AllowSpill=*/true);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(TmpResultReg);
Comment on lines +2601 to +2603

Contributor: This whole sequence is repeated above?

Contributor Author: Yes, that is when offset is 0.
ResultReg = NewDest;
}
if (!IsSALU)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
.addReg(ScaledReg, RegState::Kill);
.addReg(TmpResultReg, RegState::Kill);
else
ResultReg = ScaledReg;

ResultReg = TmpResultReg;
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
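
The remaining lines of this hunk are cut off above. They handle the case where no SGPR could be scavenged, so ScaledReg aliases the frame register itself and the scaling must be inverted before other users of the frame register run. Assuming the inverse mirrors the forward transform (subtract the offset, then shift left; an assumption, since the code is truncated), the arithmetic round-trips like this:

// Sketch of the assumed undo arithmetic when ScaledReg aliases FrameReg.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Log2 = 6;   // wave64
  uint32_t FrameReg = 0x800; // original wave-space value, wave-aligned
  uint32_t Offset = 24;
  uint32_t Scaled = (FrameReg >> Log2) + Offset; // forward transform
  uint32_t Restored = (Scaled - Offset) << Log2; // assumed undo sequence
  assert(Restored == FrameReg); // holds because FrameReg's low 6 bits are 0
  return 0;
}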