@@ -106,9 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
 }
 
+static bool canRemat(const MachineInstr &MI) {
+
+  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
+      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
+      SIInstrInfo::isSALU(MI))
+    return true;
+
+  if (SIInstrInfo::isSMRD(MI)) {
+    return !MI.memoperands_empty() &&
+           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+             return MMO->isLoad() && MMO->isInvariant();
+           });
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::isReallyTriviallyReMaterializable(
     const MachineInstr &MI) const {
-  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+
+  if (canRemat(MI)) {
     // Normally VALU use of exec would block the rematerialization, but that
     // is OK in this case to have an implicit exec read as all VALU do.
     // We really want all of the generic logic for this except for this.
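
This first hunk factors the rematerialization check into a file-local canRemat() helper and widens it: in addition to the VOP1/VOP2/VOP3/SDWA/SALU classes that were already accepted, an SMRD (scalar memory read) now qualifies when every attached memory operand is an invariant load, i.e. the loaded value cannot change between the original definition and the rematerialization point, so re-executing the load is safe. Below is a minimal standalone sketch of that memoperand check, using a hypothetical MemOp struct in place of llvm::MachineMemOperand (illustration only, not LLVM API):

#include <algorithm>
#include <cassert>
#include <vector>

// Stand-in for llvm::MachineMemOperand; only the two queried flags.
struct MemOp {
  bool IsLoad;      // mirrors MachineMemOperand::isLoad()
  bool IsInvariant; // mirrors MachineMemOperand::isInvariant()
};

// Mirrors the SMRD branch of canRemat(): an empty memoperand list is
// conservatively rejected, since nothing is then known about the access.
static bool smrdIsRematerializable(const std::vector<MemOp> &MemOps) {
  return !MemOps.empty() &&
         std::all_of(MemOps.begin(), MemOps.end(), [](const MemOp &MO) {
           return MO.IsLoad && MO.IsInvariant;
         });
}

int main() {
  assert(smrdIsRematerializable({{true, true}}));   // invariant load: remat OK
  assert(!smrdIsRematerializable({{true, false}})); // mutable memory: reject
  assert(!smrdIsRematerializable({}));              // no info: reject
}
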
@@ -2434,6 +2452,92 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   return true;
 }
 
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, Register DestReg,
+                                unsigned SubIdx, const MachineInstr &Orig,
+                                const TargetRegisterInfo &RI) const {
+
+  // Try shrinking the instruction to remat only the part needed for current
+  // context.
+  // TODO: Handle more cases.
+  unsigned Opcode = Orig.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::S_LOAD_DWORDX16_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM: {
+    if (SubIdx != 0)
+      break;
+
+    if (I == MBB.end())
+      break;
+
+    if (I->isBundled())
+      break;
+
+    // Look for a single use of the register that is also a subreg.
+    Register RegToFind = Orig.getOperand(0).getReg();
+    MachineOperand *UseMO = nullptr;
+    for (auto &CandMO : I->operands()) {
+      if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
+        continue;
+      if (UseMO) {
+        UseMO = nullptr;
+        break;
+      }
+      UseMO = &CandMO;
+    }
+    if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
+      break;
+
+    unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+    unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+
+    MachineFunction *MF = MBB.getParent();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
+
+    unsigned NewOpcode = -1;
+    if (SubregSize == 256)
+      NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+    else if (SubregSize == 128)
+      NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+    else
+      break;
+
+    const MCInstrDesc &TID = get(NewOpcode);
+    const TargetRegisterClass *NewRC =
+        RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+    MRI.setRegClass(DestReg, NewRC);
+
+    UseMO->setReg(DestReg);
+    UseMO->setSubReg(AMDGPU::NoSubRegister);
+
+    // Use a smaller load with the desired size, possibly with updated offset.
+    MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+    MI->setDesc(TID);
+    MI->getOperand(0).setReg(DestReg);
+    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+    if (Offset) {
+      MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+      int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
+      OffsetMO->setImm(FinalOffset);
+    }
+    SmallVector<MachineMemOperand *> NewMMOs;
+    for (const MachineMemOperand *MemOp : Orig.memoperands())
+      NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+                                                 SubregSize / 8));
+    MI->setMemRefs(*MF, NewMMOs);
+
+    MBB.insert(I, MI);
+    return;
+  }
+
+  default:
+    break;
+  }
+
+  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+}
+
 std::pair<MachineInstr*, MachineInstr*>
 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
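
The shrink path above only fires in a narrow case: the SubIdx of the remat itself must be 0, the instruction at the insertion point must be unbundled and contain exactly one non-def use of the original destination, and that use must read a proper subregister. The arithmetic is then simple: getSubRegIdxOffset() and getSubRegIdxSize() report bits, so the load's immediate offset grows by Offset / 8 bytes and the rebuilt memory operand covers SubregSize / 8 bytes. A standalone sketch of just that mapping, under hypothetical names (ShrunkLoad and shrinkSLoad are illustrations, not LLVM API):

#include <cassert>
#include <cstdint>
#include <optional>
#include <string>

struct ShrunkLoad {
  std::string Opcode;    // narrower S_LOAD_* opcode to use
  int64_t ByteOffset;    // updated immediate offset, in bytes
  unsigned MemSizeBytes; // size for the rebuilt memory operand
};

// Maps the subregister actually read by the single user onto a narrower
// scalar load, mirroring the switch in SIInstrInfo::reMaterialize().
static std::optional<ShrunkLoad> shrinkSLoad(unsigned SubregBitSize,
                                             unsigned SubregBitOffset,
                                             int64_t OrigByteOffset) {
  ShrunkLoad S;
  if (SubregBitSize == 256)
    S.Opcode = "S_LOAD_DWORDX8_IMM";
  else if (SubregBitSize == 128)
    S.Opcode = "S_LOAD_DWORDX4_IMM";
  else
    return std::nullopt; // other widths fall back to plain cloning
  S.ByteOffset = OrigByteOffset + SubregBitOffset / 8; // bits -> bytes
  S.MemSizeBytes = SubregBitSize / 8;
  return S;
}

int main() {
  // User reads the upper 256 bits of a 512-bit S_LOAD_DWORDX16_IMM result
  // loaded at byte offset 16: remat as an X8 load at offset 16 + 32 = 48.
  auto S = shrinkSLoad(256, 256, 16);
  assert(S && S->Opcode == "S_LOAD_DWORDX8_IMM");
  assert(S->ByteOffset == 48 && S->MemSizeBytes == 32);
}
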