Skip to content

[AMDGPU][True16][CodeGen] legalize 16bit and 32bit use-def chain for moveToVALU in si-fix-sgpr-lowering #138734

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 55 additions & 40 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7227,27 +7227,52 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}

// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
// subreg access properly. This can be removed after we have sgpr16 in place
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
// lowering (change spgr to vgpr).
// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
// size. Need to legalize the size of the operands during the vgpr lowering
// chain. This can be removed after we have sgpr16 in place
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
MachineRegisterInfo &MRI) const {
unsigned Opcode = Inst.getOpcode();
if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
if (!ST.useRealTrue16Insts())
return;

for (MachineOperand &Op : Inst.explicit_operands()) {
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
if (RI.getRegSizeInBits(*RC) == 16) {
Op.setSubReg(AMDGPU::lo16);
}
}
}
unsigned Opcode = MI.getOpcode();
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
OpIdx >= get(Opcode).getNumOperands())
return;

MachineOperand &Op = MI.getOperand(OpIdx);
if (!Op.isReg() || !Op.getReg().isVirtual())
return;

const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
if (!RI.isVGPRClass(CurrRC))
return;

unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
Op.setSubReg(AMDGPU::lo16);
} else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
const DebugLoc &DL = MI.getDebugLoc();
Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
.addReg(Op.getReg())
.addImm(AMDGPU::lo16)
.addReg(Undef)
.addImm(AMDGPU::hi16);
Op.setReg(NewDstReg);
}
}
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
MachineRegisterInfo &MRI) const {
for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
Expand Down Expand Up @@ -7769,15 +7794,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}

// If this is a v2s copy src from 16bit to 32bit,
// replace vgpr copy to reg_sequence
// If this is a v2s copy between 16bit and 32bit reg,
// replace vgpr copy to reg_sequence/extract_subreg
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
32 == RI.getRegSizeInBits(*NewDstRC)) {
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
Expand All @@ -7789,7 +7813,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addReg(Undef)
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();

MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
} else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
AMDGPU::lo16)) {
Inst.getOperand(1).setSubReg(AMDGPU::lo16);
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
Expand Down Expand Up @@ -7885,23 +7915,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
assert(NewDstRC);
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);

// Check useMI of NewInstr. If used by a true16 instruction,
// add a lo16 subreg access if size mismatched
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
E = MRI.use_end();
I != E; ++I) {
MachineInstr &UseMI = *I->getParent();
unsigned UseMIOpcode = UseMI.getOpcode();
if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
(16 ==
RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
I->setSubReg(AMDGPU::lo16);
}
}
}
}
fixImplicitOperands(*NewInstr);

Expand Down Expand Up @@ -8709,6 +8722,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
++I;
} while (I != E && I->getParent() == &UseMI);
} else {
legalizeOperandsVALUt16(UseMI, OpNo, MRI);

++I;
}
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Fix operands in Inst to fix 16bit SALU to VALU lowering.
void legalizeOperandsVALUt16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;

/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
Expand Down
Loading