@@ -7227,48 +7227,53 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
   return DeferredList.contains(MI);
 }
 
-// legalize operand between 16bit and 32bit registers in v2s copy
+// Legalize size mismatches between 16bit and 32bit registers in v2s copy
 // lowering (change spgr to vgpr).
 // This is mainly caused by 16bit SALU and 16bit VALU using reg with different
 // size. Need to legalize the size of the operands during the vgpr lowering
 // chain. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
                                            MachineRegisterInfo &MRI) const {
   if (!ST.useRealTrue16Insts())
     return;
 
   unsigned Opcode = MI.getOpcode();
   MachineBasicBlock *MBB = MI.getParent();
+  // Legalize operands and check for size mismatch
+  if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
+      OpIdx >= get(Opcode).getNumOperands())
+    return;
 
-  // legalize operands and check for size mismatch
-  for (MachineOperand &Op : MI.explicit_operands()) {
-    unsigned OpIdx = Op.getOperandNo();
-    if (!OpIdx)
-      continue;
-    if (Op.isReg() && Op.getReg().isVirtual()) {
-      const TargetRegisterClass *DefRC = MRI.getRegClass(Op.getReg());
-      if (!RI.isVGPRClass(DefRC))
-        continue;
-      unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
-      const TargetRegisterClass *UseRC = RI.getRegClass(RCID);
-      if (RI.getMatchingSuperRegClass(DefRC, UseRC, AMDGPU::lo16)) {
-        Op.setSubReg(AMDGPU::lo16);
-      } else if (RI.getMatchingSuperRegClass(UseRC, DefRC, AMDGPU::lo16)) {
-        const DebugLoc &DL = MI.getDebugLoc();
-        Register NewDstReg =
-            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
-        BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
-        BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
-            .addReg(Op.getReg())
-            .addImm(AMDGPU::lo16)
-            .addReg(Undef)
-            .addImm(AMDGPU::hi16);
-        Op.setReg(NewDstReg);
-      }
-    }
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg() || !Op.getReg().isVirtual())
+    return;
+
+  const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
+  if (!RI.isVGPRClass(CurrRC))
+    return;
+
+  unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+  const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+  if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
+    Op.setSubReg(AMDGPU::lo16);
+  } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
+    const DebugLoc &DL = MI.getDebugLoc();
+    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+    BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+    BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+        .addReg(Op.getReg())
+        .addImm(AMDGPU::lo16)
+        .addReg(Undef)
+        .addImm(AMDGPU::hi16);
+    Op.setReg(NewDstReg);
   }
 }
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
+                                           MachineRegisterInfo &MRI) const {
+  for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
+    legalizeOperandsVALUt16(MI, OpIdx, MRI);
+}
 
 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
                              MachineDominatorTree *MDT) const {
@@ -7789,15 +7794,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     return;
   }
 
-  // If this is a v2s copy src from 16bit to 32bit,
-  // replace vgpr copy to reg_sequence
+  // If this is a v2s copy between 16bit and 32bit reg,
+  // replace vgpr copy to reg_sequence/extract_subreg
   // This can be remove after we have sgpr16 in place
   if (ST.useRealTrue16Insts() && Inst.isCopy() &&
       Inst.getOperand(1).getReg().isVirtual() &&
       RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
     const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
-    if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
-        32 == RI.getRegSizeInBits(*NewDstRC)) {
+    if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
       Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
       Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
       BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
@@ -7810,18 +7814,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
           .addImm(AMDGPU::hi16);
       Inst.eraseFromParent();
       MRI.replaceRegWith(DstReg, NewDstReg);
-      // legalize useMI with mismatched size
-      for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
-                                             E = MRI.use_end();
-           I != E; ++I) {
-        MachineInstr &UseMI = *I->getParent();
-        unsigned UseMIOpcode = UseMI.getOpcode();
-        if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
-            (16 ==
-             RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
-          I->setSubReg(AMDGPU::lo16);
-        }
-      }
+      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+      return;
+    } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
+                                           AMDGPU::lo16)) {
+      Inst.getOperand(1).setSubReg(AMDGPU::lo16);
+      Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
       addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
       return;
     }
@@ -7916,23 +7915,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       assert(NewDstRC);
       NewDstReg = MRI.createVirtualRegister(NewDstRC);
       MRI.replaceRegWith(DstReg, NewDstReg);
-
-      // Check useMI of NewInstr. If used by a true16 instruction,
-      // add a lo16 subreg access if size mismatched
-      // This can be remove after we have sgpr16 in place
-      if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
-        for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
-                                               E = MRI.use_end();
-             I != E; ++I) {
-          MachineInstr &UseMI = *I->getParent();
-          unsigned UseMIOpcode = UseMI.getOpcode();
-          if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
-              (16 ==
-               RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
-            I->setSubReg(AMDGPU::lo16);
-          }
-        }
-      }
     }
     fixImplicitOperands(*NewInstr);
 
@@ -8740,6 +8722,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
         ++I;
       } while (I != E && I->getParent() == &UseMI);
     } else {
+      legalizeOperandsVALUt16(UseMI, OpNo, MRI);
+
       ++I;
     }
   }