Skip to content

Commit 5231736

Browse files
authored
[AMDGPU] Do not allow M0 as v_readfirstlane_b32 dst (#128851)
M0 can only be written by the SALU, so `v_readfirstlane_b32 m0` is effectively useless. Represent this by restricting the destination register class of that instruction to `SReg_32_XM0`, which excludes M0. There are a lot of test changes due to the register class changing, but most of them are trivial. In some cases, an extra register and an `s_mov_b32` are needed. Fixes SWDEV-513269.
1 parent 5f4d1f7 commit 5231736

File tree

149 files changed

+3885
-3864
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

149 files changed

+3885
-3864
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
733733

734734
for (unsigned i = 0; i < NumParts; ++i) {
735735
Register SrcPart = SrcParts[i];
736-
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
736+
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
737737
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738738

739739
const TargetRegisterClass *Constrained =

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ DECODE_OPERAND_REG_8(VReg_512)
271271
DECODE_OPERAND_REG_8(VReg_1024)
272272

273273
DECODE_OPERAND_REG_7(SReg_32, OPW32)
274+
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
274275
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
275276
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
276277
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10691069
<< " is being turned to v_readfirstlane_b32"
10701070
<< " Score: " << C.second.Score << "\n");
10711071
Register DstReg = MI->getOperand(0).getReg();
1072+
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1073+
10721074
Register SrcReg = MI->getOperand(1).getReg();
10731075
unsigned SubReg = MI->getOperand(1).getSubReg();
10741076
const TargetRegisterClass *SrcRC =
@@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10921094
Result, *MRI, MI->getOperand(1), SrcRC,
10931095
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
10941096
Register PartialDst =
1095-
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1097+
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
10961098
BuildMI(*MBB, *Result, Result->getDebugLoc(),
10971099
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
10981100
.addReg(PartialSrc);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ class PrologEpilogSGPRSpillBuilder {
305305

306306
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
307307
TmpVGPR, FI, FrameReg, DwordOff);
308+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
308309
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
309310
.addReg(TmpVGPR, RegState::Kill);
310311
DwordOff += 4;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4569,7 +4569,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
45694569
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
45704570
Register PhiExec = MRI.createVirtualRegister(BoolRC);
45714571
Register NewExec = MRI.createVirtualRegister(BoolRC);
4572-
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4572+
Register CurrentIdxReg =
4573+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
45734574
Register CondReg = MRI.createVirtualRegister(BoolRC);
45744575

45754576
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -5255,18 +5256,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
52555256
? AMDGPU::S_ADDC_U32
52565257
: AMDGPU::S_SUBB_U32;
52575258
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5258-
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259+
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52595260
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
52605261
.addReg(Src0.getReg());
52615262
Src0.setReg(RegOp0);
52625263
}
52635264
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5264-
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5265+
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52655266
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
52665267
.addReg(Src1.getReg());
52675268
Src1.setReg(RegOp1);
52685269
}
5269-
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5270+
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
52705271
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
52715272
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
52725273
.addReg(Src2.getReg());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6515,7 +6515,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
65156515
Register VScalarOp = ScalarOp->getReg();
65166516

65176517
if (NumSubRegs == 1) {
6518-
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6518+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
65196519

65206520
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
65216521
.addReg(VScalarOp);
@@ -6547,8 +6547,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
65476547
"Unhandled register size");
65486548

65496549
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6550-
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6551-
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6550+
Register CurRegLo =
6551+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6552+
Register CurRegHi =
6553+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
65526554

65536555
// Read the next variant <- also loop target.
65546556
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7657,9 +7659,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
76577659
if (Inst.isCopy() && DstReg.isPhysical() &&
76587660
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
76597661
// TODO: Only works for 32 bit registers.
7660-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7661-
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7662-
.add(Inst.getOperand(1));
7662+
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7663+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7664+
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7665+
.add(Inst.getOperand(1));
7666+
} else {
7667+
Register NewDst =
7668+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7669+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7670+
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7671+
.add(Inst.getOperand(1));
7672+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7673+
DstReg)
7674+
.addReg(NewDst);
7675+
}
76637676
Inst.eraseFromParent();
76647677
return;
76657678
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2994,10 +2994,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
29942994
if (IsSALU && !LiveSCC)
29952995
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
29962996
if (IsSALU && LiveSCC) {
2997-
Register NewDest =
2998-
IsCopy ? ResultReg
2999-
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
3000-
Shift, false, 0);
2997+
Register NewDest;
2998+
if (IsCopy) {
2999+
MF->getRegInfo().constrainRegClass(ResultReg,
3000+
&AMDGPU::SReg_32_XM0RegClass);
3001+
NewDest = ResultReg;
3002+
} else {
3003+
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3004+
Shift, false, 0);
3005+
}
30013006
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
30023007
.addReg(TmpResultReg);
30033008
ResultReg = NewDest;
@@ -3120,10 +3125,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
31203125
.addReg(TmpResultReg);
31213126
}
31223127

3123-
Register NewDest = IsCopy ? ResultReg
3124-
: RS->scavengeRegisterBackwards(
3125-
AMDGPU::SReg_32RegClass, *Add,
3126-
false, 0, /*AllowSpill=*/true);
3128+
Register NewDest;
3129+
if (IsCopy) {
3130+
MF->getRegInfo().constrainRegClass(ResultReg,
3131+
&AMDGPU::SReg_32_XM0RegClass);
3132+
NewDest = ResultReg;
3133+
} else {
3134+
NewDest = RS->scavengeRegisterBackwards(
3135+
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3136+
/*AllowSpill=*/true);
3137+
}
3138+
31273139
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
31283140
NewDest)
31293141
.addReg(TmpResultReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
243243
} // End isMoveImm = 1
244244

245245
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
246-
let DstRC = RegisterOperand<SReg_32>;
246+
let DstRC = RegisterOperand<SReg_32_XM0>;
247247
let Src0RC32 = VRegOrLdsSrc_32;
248248
let Asm32 = " $vdst, $src0";
249249
}

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ body: |
1212
; CHECK-NEXT: ALL VALUES UNIFORM
1313
%0:vgpr_32 = IMPLICIT_DEF
1414
%1:vgpr_32 = IMPLICIT_DEF
15-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
15+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
1616
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
1717
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
1818
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
3333
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
3434
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
3535
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
36-
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
36+
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
3737
S_ENDPGM 0
3838
...
3939

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body: |
1414
%0:vreg_64 = IMPLICIT_DEF
1515
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
1616
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
17-
%3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
17+
%3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
1818
S_ENDPGM 0
1919
...
2020

@@ -50,7 +50,7 @@ body: |
5050
%1:vreg_64 = IMPLICIT_DEF
5151
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
5252
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
53-
%4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
53+
%4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
5454
S_ENDPGM 0
5555
...
5656

@@ -104,7 +104,7 @@ body: |
104104
105105
%0:vgpr_32 = IMPLICIT_DEF
106106
%1:vgpr_32 = IMPLICIT_DEF
107-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
107+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
108108
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
109109
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
110110
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec

0 commit comments

Comments
 (0)