Skip to content

[AMDGPU] Do not allow M0 as v_readfirstlane_b32 dst #128851

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,

for (unsigned i = 0; i < NumParts; ++i) {
Register SrcPart = SrcParts[i];
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

const TargetRegisterClass *Constrained =
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);

Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC =
Expand All @@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
Result, *MRI, MI->getOperand(1), SrcRC,
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
Register PartialDst =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, *Result, Result->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
.addReg(PartialSrc);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ class PrologEpilogSGPRSpillBuilder {

buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
TmpVGPR, FI, FrameReg, DwordOff);
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);
DwordOff += 4;
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4569,7 +4569,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Register PhiExec = MRI.createVirtualRegister(BoolRC);
Register NewExec = MRI.createVirtualRegister(BoolRC);
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register CurrentIdxReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CondReg = MRI.createVirtualRegister(BoolRC);

BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
Expand Down Expand Up @@ -5255,18 +5256,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32;
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
.addReg(Src0.getReg());
Src0.setReg(RegOp0);
}
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
.addReg(Src1.getReg());
Src1.setReg(RegOp1);
}
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
.addReg(Src2.getReg());
Expand Down
25 changes: 19 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6515,7 +6515,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
Register VScalarOp = ScalarOp->getReg();

if (NumSubRegs == 1) {
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
.addReg(VScalarOp);
Expand Down Expand Up @@ -6547,8 +6547,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
"Unhandled register size");

for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register CurRegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CurRegHi =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

// Read the next variant <- also loop target.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
Expand Down Expand Up @@ -7657,9 +7659,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
if (Inst.isCopy() && DstReg.isPhysical() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
// TODO: Only works for 32 bit registers.
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
.add(Inst.getOperand(1));
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.add(Inst.getOperand(1));
} else {
Register NewDst =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
.add(Inst.getOperand(1));
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
DstReg)
.addReg(NewDst);
}
Inst.eraseFromParent();
return;
}
Expand Down
28 changes: 20 additions & 8 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2994,10 +2994,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
Register NewDest =
IsCopy ? ResultReg
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
Shift, false, 0);
Register NewDest;
if (IsCopy) {
MF->getRegInfo().constrainRegClass(ResultReg,
&AMDGPU::SReg_32_XM0RegClass);
NewDest = ResultReg;
} else {
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
Shift, false, 0);
}
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
.addReg(TmpResultReg);
ResultReg = NewDest;
Expand Down Expand Up @@ -3120,10 +3125,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(TmpResultReg);
}

Register NewDest = IsCopy ? ResultReg
: RS->scavengeRegisterBackwards(
AMDGPU::SReg_32RegClass, *Add,
false, 0, /*AllowSpill=*/true);
Register NewDest;
if (IsCopy) {
MF->getRegInfo().constrainRegClass(ResultReg,
&AMDGPU::SReg_32_XM0RegClass);
NewDest = ResultReg;
} else {
NewDest = RS->scavengeRegisterBackwards(
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
/*AllowSpill=*/true);
}

BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(TmpResultReg);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1

def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
let DstRC = RegisterOperand<SReg_32>;
let DstRC = RegisterOperand<SReg_32_XM0>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This restriction also applies to readlane and writelane. I assume this patch misses those because they have different signatures.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not writelane — its destination is a VGPR, so of the two only readlane is affected.

let Src0RC32 = VRegOrLdsSrc_32;
let Asm32 = " $vdst, $src0";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ body: |
; CHECK-NEXT: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
Expand All @@ -33,7 +33,7 @@ body: |
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
S_ENDPGM 0
...

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ body: |
%0:vreg_64 = IMPLICIT_DEF
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
%3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
%3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
S_ENDPGM 0
...

Expand Down Expand Up @@ -50,7 +50,7 @@ body: |
%1:vreg_64 = IMPLICIT_DEF
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
%4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
S_ENDPGM 0
...

Expand Down Expand Up @@ -104,7 +104,7 @@ body: |

%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
Expand Down
Loading