@@ -2200,9 +2200,14 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   assert(MI.isRegSequence());
   auto Reg = MI.getOperand(0).getReg();
   const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+  const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
+  const TargetRegisterClass *RC =
+      TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());
 
   if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
-      !MRI->hasOneNonDBGUse(Reg) || !TRI->isProperlyAlignedRC(*DefRC))
+      !MRI->hasOneNonDBGUse(Reg) ||
+      (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
+       DefRC != RC))
     return false;
 
   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
@@ -2211,10 +2216,10 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
 
   // Only attempting to fold immediate materializations.
   if (!Defs.empty() &&
-      !std::all_of(Defs.begin(), Defs.end(),
-                   [](const std::pair<MachineOperand *, unsigned> &Op) {
-                     return Op.first->isImm();
-                   }))
+      std::any_of(Defs.begin(), Defs.end(),
+                  [](const std::pair<MachineOperand *, unsigned> &Op) {
+                    return !Op.first->isImm();
+                  }))
     return false;
 
   SmallVector<uint64_t, 8> ImmVals;
@@ -2240,9 +2245,8 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   }
 
   // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
-  if (DefRC == TRI->getVGPR64Class()) {
-    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
-            TII->get(AMDGPU::V_MOV_B64_PSEUDO), Reg)
+  if (DefRC == RC) {
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
         .addImm(ImmVals[0]);
     MI.eraseFromParent();
     return true;
@@ -2257,21 +2261,22 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
     MI.removeOperand(i);
 
-  for (unsigned i = 0; i < ImmVals.size(); ++i) {
-    const TargetRegisterClass *RC = TRI->getVGPR64Class();
+  unsigned Ch = 0;
+  for (uint64_t Val : ImmVals) {
     Register MovReg = MRI->createVirtualRegister(RC);
     // Duplicate vmov imm materializations (e.g., splatted operands) should get
     // combined by MachineCSE pass.
     BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
             TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
-        .addImm(ImmVals[i]);
+        .addImm(Val);
 
     // 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
     unsigned SubReg64B =
-        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/i * 2, /*SubRegs=*/2);
+        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2);
 
     MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
     MI.addOperand(MachineOperand::CreateImm(SubReg64B));
+    ++Ch;
   }
 
   LLVM_DEBUG(dbgs() << "Folded into " << MI);
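
To illustrate the channel math in the loop above, here is a minimal standalone sketch (plain C++, not the LLVM API) of how consecutive 32-bit REG_SEQUENCE immediates would be packed into 64-bit V_MOV_B64 payloads and assigned non-overlapping subregister pairs via getSubRegFromChannel(Ch * 2, /*SubRegs=*/2). The helpers packImms and subRegPairName are hypothetical names, and the low-word-first packing is an assumption, since the ImmVals construction falls outside the hunks shown.

// Hypothetical standalone sketch of the packing/channel math; packImms and
// subRegPairName are illustrative helpers, not LLVM APIs.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Pack consecutive 32-bit REG_SEQUENCE immediates into 64-bit mov payloads,
// assuming the lower-numbered subregister holds the low 32 bits.
static std::vector<uint64_t> packImms(const std::vector<uint32_t> &Imms) {
  std::vector<uint64_t> Vals;
  for (size_t I = 0; I + 1 < Imms.size(); I += 2)
    Vals.push_back(uint64_t(Imms[I]) | (uint64_t(Imms[I + 1]) << 32));
  return Vals;
}

// Mirrors getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2): each 64-bit
// mov lands in a non-overlapping pair subN_subN+1 (sub0_sub1, sub2_sub3, ...).
static std::string subRegPairName(unsigned Ch) {
  unsigned Lo = Ch * 2;
  return "sub" + std::to_string(Lo) + "_sub" + std::to_string(Lo + 1);
}

int main() {
  // E.g. a v4i32 splat of 0x3f800000 assembled by a REG_SEQUENCE.
  std::vector<uint32_t> Imms = {0x3f800000u, 0x3f800000u,
                                0x3f800000u, 0x3f800000u};
  unsigned Ch = 0;
  for (uint64_t Val : packImms(Imms))
    std::printf("V_MOV_B64 0x%016llx -> %s\n",
                (unsigned long long)Val, subRegPairName(Ch++).c_str());
  return 0;
}

For a splatted input like this, every packed value is identical; as the in-loop comment notes, MachineCSE is expected to combine the duplicate V_MOV_B64 materializations afterwards.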