Skip to content

Commit bbb4ebf

Browse files
authored
(cherry-pick) [AMDGPU] Prevent m0 from being used as v_readlane/v_readfirstlane dst (llvm#1080)
Combined cherry-pick of 0f0d3fb and 5231736 from amd-staging. Fixes SWDEV-513763.
1 parent 6958fc1 commit bbb4ebf

File tree

151 files changed

+3791
-3742
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

151 files changed

+3791
-3742
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
733733

734734
for (unsigned i = 0; i < NumParts; ++i) {
735735
Register SrcPart = SrcParts[i];
736-
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
736+
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
737737
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738738

739739
const TargetRegisterClass *Constrained =

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ DECODE_OPERAND_REG_8(VReg_512)
289289
DECODE_OPERAND_REG_8(VReg_1024)
290290

291291
DECODE_OPERAND_REG_7(SReg_32, OPW32)
292+
DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
292293
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
293294
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
294295
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10581058
<< " is being turned to v_readfirstlane_b32"
10591059
<< " Score: " << C.second.Score << "\n");
10601060
Register DstReg = MI->getOperand(0).getReg();
1061+
MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1062+
10611063
Register SrcReg = MI->getOperand(1).getReg();
10621064
unsigned SubReg = MI->getOperand(1).getSubReg();
10631065
const TargetRegisterClass *SrcRC =
@@ -1081,7 +1083,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10811083
Result, *MRI, MI->getOperand(1), SrcRC,
10821084
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
10831085
Register PartialDst =
1084-
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1086+
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
10851087
BuildMI(*MBB, *Result, Result->getDebugLoc(),
10861088
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
10871089
.addReg(PartialSrc);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ class PrologEpilogSGPRSpillBuilder {
439439

440440
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
441441
TmpVGPR, FI, FrameReg, DwordOff);
442+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
442443
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
443444
.addReg(TmpVGPR, RegState::Kill);
444445
DwordOff += 4;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4516,7 +4516,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
45164516
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
45174517
Register PhiExec = MRI.createVirtualRegister(BoolRC);
45184518
Register NewExec = MRI.createVirtualRegister(BoolRC);
4519-
Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4519+
Register CurrentIdxReg =
4520+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
45204521
Register CondReg = MRI.createVirtualRegister(BoolRC);
45214522

45224523
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -4932,7 +4933,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49324933
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
49334934

49344935
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4935-
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4936+
Register LaneValueReg =
4937+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
49364938

49374939
bool IsWave32 = ST.isWave32();
49384940
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5180,18 +5182,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
51805182
? AMDGPU::S_ADDC_U32
51815183
: AMDGPU::S_SUBB_U32;
51825184
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5183-
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5185+
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51845186
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
51855187
.addReg(Src0.getReg());
51865188
Src0.setReg(RegOp0);
51875189
}
51885190
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5189-
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5191+
Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51905192
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
51915193
.addReg(Src1.getReg());
51925194
Src1.setReg(RegOp1);
51935195
}
5194-
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5196+
Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
51955197
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
51965198
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
51975199
.addReg(Src2.getReg());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2230,6 +2230,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
22302230

22312231
case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
22322232
MI.setDesc(get(AMDGPU::V_READLANE_B32));
2233+
MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2234+
&AMDGPU::SReg_32_XM0RegClass);
22332235
break;
22342236

22352237
case AMDGPU::V_MOV_B64_PSEUDO: {
@@ -6356,7 +6358,7 @@ static void emitLoadScalarOpsFromVGPRLoop(
63566358
Register VScalarOp = ScalarOp->getReg();
63576359

63586360
if (NumSubRegs == 1) {
6359-
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6361+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63606362

63616363
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
63626364
.addReg(VScalarOp);
@@ -6387,8 +6389,10 @@ static void emitLoadScalarOpsFromVGPRLoop(
63876389
"Unhandled register size");
63886390

63896391
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6390-
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6391-
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6392+
Register CurRegLo =
6393+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6394+
Register CurRegHi =
6395+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
63926396

63936397
// Read the next variant <- also loop target.
63946398
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7444,9 +7448,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74447448
if (Inst.isCopy() && DstReg.isPhysical() &&
74457449
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
74467450
// TODO: Only works for 32 bit registers.
7447-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7448-
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7449-
.add(Inst.getOperand(1));
7451+
if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7452+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7453+
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7454+
.add(Inst.getOperand(1));
7455+
} else {
7456+
Register NewDst =
7457+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7458+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7459+
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7460+
.add(Inst.getOperand(1));
7461+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7462+
DstReg)
7463+
.addReg(NewDst);
7464+
}
74507465
Inst.eraseFromParent();
74517466
return;
74527467
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2115,6 +2115,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
21152115
// Don't need to write VGPR out.
21162116
}
21172117

2118+
MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2119+
21182120
// Restore clobbered registers in the specified restore block.
21192121
MI = RestoreMBB.end();
21202122
SB.setMI(&RestoreMBB, MI);
@@ -2129,6 +2131,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
21292131
SB.NumSubRegs == 1
21302132
? SB.SuperReg
21312133
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2134+
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
21322135
bool LastSubReg = (i + 1 == e);
21332136
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
21342137
SubReg)
@@ -2960,10 +2963,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
29602963
if (IsSALU && !LiveSCC)
29612964
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
29622965
if (IsSALU && LiveSCC) {
2963-
Register NewDest =
2964-
IsCopy ? ResultReg
2965-
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
2966-
Shift, false, 0);
2966+
Register NewDest;
2967+
if (IsCopy) {
2968+
MF->getRegInfo().constrainRegClass(ResultReg,
2969+
&AMDGPU::SReg_32_XM0RegClass);
2970+
NewDest = ResultReg;
2971+
} else {
2972+
NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2973+
Shift, false, 0);
2974+
}
29672975
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
29682976
.addReg(TmpResultReg);
29692977
ResultReg = NewDest;
@@ -3086,10 +3094,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
30863094
.addReg(TmpResultReg);
30873095
}
30883096

3089-
Register NewDest = IsCopy ? ResultReg
3090-
: RS->scavengeRegisterBackwards(
3091-
AMDGPU::SReg_32RegClass, *Add,
3092-
false, 0, /*AllowSpill=*/true);
3097+
Register NewDest;
3098+
if (IsCopy) {
3099+
MF->getRegInfo().constrainRegClass(ResultReg,
3100+
&AMDGPU::SReg_32_XM0RegClass);
3101+
NewDest = ResultReg;
3102+
} else {
3103+
NewDest = RS->scavengeRegisterBackwards(
3104+
AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3105+
/*AllowSpill=*/true);
3106+
}
30933107
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
30943108
NewDest)
30953109
.addReg(TmpResultReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
241241
} // End isMoveImm = 1
242242

243243
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
244-
let DstRC = RegisterOperand<SReg_32>;
244+
let DstRC = RegisterOperand<SReg_32_XM0>;
245245
let Src0RC32 = VRegOrLdsSrc_32;
246246
let Asm32 = " $vdst, $src0";
247247
}

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
685685
}
686686

687687
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
688-
let Outs32 = (outs SReg_32:$vdst);
688+
let Outs32 = (outs SReg_32_XM0:$vdst);
689689
let Outs64 = Outs32;
690690
let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1);
691691
let Ins64 = Ins32;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ body: |
1212
; CHECK-NEXT: ALL VALUES UNIFORM
1313
%0:vgpr_32 = IMPLICIT_DEF
1414
%1:vgpr_32 = IMPLICIT_DEF
15-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
15+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
1616
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
1717
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
1818
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
3333
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
3434
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
3535
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
36-
%5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
36+
%5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
3737
S_ENDPGM 0
3838
...
3939

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body: |
1414
%0:vreg_64 = IMPLICIT_DEF
1515
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
1616
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
17-
%3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
17+
%3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
1818
S_ENDPGM 0
1919
...
2020

@@ -50,7 +50,7 @@ body: |
5050
%1:vreg_64 = IMPLICIT_DEF
5151
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
5252
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
53-
%4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
53+
%4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
5454
S_ENDPGM 0
5555
...
5656

@@ -104,7 +104,7 @@ body: |
104104
105105
%0:vgpr_32 = IMPLICIT_DEF
106106
%1:vgpr_32 = IMPLICIT_DEF
107-
%2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
107+
%2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
108108
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
109109
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
110110
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec

0 commit comments

Comments
 (0)