@@ -2205,9 +2205,14 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   assert(MI.isRegSequence());
   auto Reg = MI.getOperand(0).getReg();
   const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+  const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
+  const TargetRegisterClass *RC =
+      TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());
 
   if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
-      !MRI->hasOneNonDBGUse(Reg) || !TRI->isProperlyAlignedRC(*DefRC))
+      !MRI->hasOneNonDBGUse(Reg) ||
+      (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
+       DefRC != RC))
     return false;
 
   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
@@ -2216,10 +2221,10 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
 
   // Only attempting to fold immediate materializations.
   if (!Defs.empty() &&
-      !std::all_of(Defs.begin(), Defs.end(),
-                   [](const std::pair<MachineOperand *, unsigned> &Op) {
-                     return Op.first->isImm();
-                   }))
+      std::any_of(Defs.begin(), Defs.end(),
+                  [](const std::pair<MachineOperand *, unsigned> &Op) {
+                    return !Op.first->isImm();
+                  }))
     return false;
 
   SmallVector<uint64_t, 8> ImmVals;
@@ -2245,9 +2250,8 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   }
 
   // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
-  if (DefRC == TRI->getVGPR64Class()) {
-    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
-            TII->get(AMDGPU::V_MOV_B64_PSEUDO), Reg)
+  if (DefRC == RC) {
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
         .addImm(ImmVals[0]);
     MI.eraseFromParent();
     return true;
@@ -2262,21 +2266,22 @@ bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
   for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
     MI.removeOperand(i);
 
-  for (unsigned i = 0; i < ImmVals.size(); ++i) {
-    const TargetRegisterClass *RC = TRI->getVGPR64Class();
+  unsigned Ch = 0;
+  for (uint64_t Val : ImmVals) {
     Register MovReg = MRI->createVirtualRegister(RC);
     // Duplicate vmov imm materializations (e.g., splatted operands) should get
     // combined by MachineCSE pass.
     BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
             TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
-        .addImm(ImmVals[i]);
+        .addImm(Val);
 
     // 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
     unsigned SubReg64B =
-        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/i * 2, /*SubRegs=*/2);
+        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2);
 
     MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
     MI.addOperand(MachineOperand::CreateImm(SubReg64B));
+    ++Ch;
   }
 
   LLVM_DEBUG(dbgs() << " Folded into " << MI);
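For context on the channel bookkeeping in the last hunk, below is a minimal standalone sketch (not part of the patch) of the layout the loop assumes: each `ImmVals` entry is a 64-bit value covering two adjacent 32-bit REG_SEQUENCE immediates, and channel pair `Ch` maps to the non-overlapping 64-bit subregister starting at 32-bit channel `2*Ch` (sub0_sub1, then sub2_sub3, ...). The element values and the low/high dword packing order are illustrative assumptions.

// Illustration only -- not part of the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical 32-bit immediates from a REG_SEQUENCE (sub0..sub3).
  std::vector<uint32_t> Elts = {0x11111111u, 0x22222222u, 0x33333333u,
                                0x44444444u};

  // Pack adjacent pairs into 64-bit values, one per V_MOV_B64_PSEUDO
  // (assumed low dword first).
  std::vector<uint64_t> ImmVals;
  for (size_t I = 0; I + 1 < Elts.size(); I += 2)
    ImmVals.push_back(uint64_t(Elts[I]) | (uint64_t(Elts[I + 1]) << 32));

  // Channel pair Ch covers subregs (2*Ch, 2*Ch + 1): sub0_sub1, sub2_sub3.
  for (size_t Ch = 0; Ch < ImmVals.size(); ++Ch)
    std::printf("sub%zu_sub%zu <- 0x%016llx\n", 2 * Ch, 2 * Ch + 1,
                (unsigned long long)ImmVals[Ch]);
  return 0;
}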