@@ -111,6 +111,8 @@ class SIFoldOperands : public MachineFunctionPass {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+  bool tryFoldFoldableCopy(MachineInstr &MI,
+                           MachineOperand *&CurrentKnownM0Val) const;
 
   const MachineOperand *isClamp(const MachineInstr &MI) const;
   bool tryFoldClamp(MachineInstr &MI);
@@ -1292,6 +1294,73 @@ bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+bool SIFoldOperands::tryFoldFoldableCopy(
+    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
+  // Specially track simple redefs of m0 to the same value in a block, so we
+  // can erase the later ones.
+  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
+    MachineOperand &NewM0Val = MI.getOperand(1);
+    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
+      MI.eraseFromParent();
+      return true;
+    }
+
+    // We aren't tracking other physical registers
+    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
+                            ? nullptr
+                            : &NewM0Val;
+    return false;
+  }
+
+  MachineOperand &OpToFold = MI.getOperand(1);
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+
+  // FIXME: We could also be folding things like TargetIndexes.
+  if (!FoldingImm && !OpToFold.isReg())
+    return false;
+
+  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
+    return false;
+
+  // Prevent folding operands backwards in the function. For example,
+  // the COPY opcode must not be replaced by 1 in this example:
+  //
+  // %3 = COPY %vgpr0; VGPR_32:%3
+  // ...
+  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+  if (!MI.getOperand(0).getReg().isVirtual())
+    return false;
+
+  bool Changed = foldInstOperand(MI, OpToFold);
+
+  // If we managed to fold all uses of this copy then we might as well
+  // delete it now.
+  // The only reason we need to follow chains of copies here is that
+  // tryFoldRegSequence looks forward through copies before folding a
+  // REG_SEQUENCE into its eventual users.
+  auto *InstToErase = &MI;
+  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
+    auto &SrcOp = InstToErase->getOperand(1);
+    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
+    InstToErase->eraseFromParent();
+    Changed = true;
+    InstToErase = nullptr;
+    if (!SrcReg || SrcReg.isPhysical())
+      break;
+    InstToErase = MRI->getVRegDef(SrcReg);
+    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
+      break;
+  }
+
+  if (InstToErase && InstToErase->isRegSequence() &&
+      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
+    InstToErase->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
 // Clamp patterns are canonically selected to v_max_* instructions, so only
 // handle them.
 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
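The subtle part of the new tryFoldFoldableCopy is the m0 bookkeeping at its head: the pass remembers the last value written to m0, erases a later write only when it is operand-identical to the known value, and forgets the tracked value whenever the new source is a physical register it does not model. The following is a minimal, self-contained sketch of that idea in plain C++ — Write and eraseRedundantRedefs are hypothetical names for illustration, not LLVM API — assuming writes are visited in block order:

#include <optional>
#include <vector>

// Toy model of one write to a tracked special register; not an LLVM type.
struct Write {
  int Value;      // value written to the special register
  bool Trackable; // false models a source we cannot reason about
};

// Drop writes that redefine the register to the value it already holds,
// the way the pass erases redundant redefs of m0 within a block.
std::vector<Write> eraseRedundantRedefs(const std::vector<Write> &Writes) {
  std::vector<Write> Survivors;
  std::optional<int> KnownVal; // plays the role of CurrentKnownM0Val
  for (const Write &W : Writes) {
    if (KnownVal && W.Trackable && *KnownVal == W.Value)
      continue; // identical redef of the known value: erase it
    Survivors.push_back(W);
    // We aren't tracking untrackable sources, so forget the known value.
    KnownVal = W.Trackable ? std::optional<int>(W.Value) : std::nullopt;
  }
  return Survivors;
}

Note that an untrackable write still survives; it only invalidates the cached value, exactly as the nullptr assignment to CurrentKnownM0Val does above.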
@@ -1746,82 +1815,22 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
-      if (!TII->isFoldableCopy(MI)) {
-        // Saw an unknown clobber of m0, so we no longer know what it is.
-        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
-          CurrentKnownM0Val = nullptr;
-
-        // TODO: Omod might be OK if there is NSZ only on the source
-        // instruction, and not the omod multiply.
-        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
-            !tryFoldOMod(MI))
-          Changed |= tryFoldClamp(MI);
-
+      if (TII->isFoldableCopy(MI)) {
+        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
         continue;
       }
 
-      // Specially track simple redefs of m0 to the same value in a block, so we
-      // can erase the later ones.
-      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
-        MachineOperand &NewM0Val = MI.getOperand(1);
-        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
-          MI.eraseFromParent();
-          Changed = true;
-          continue;
-        }
-
-        // We aren't tracking other physical registers
-        CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
-            nullptr : &NewM0Val;
-        continue;
-      }
-
-      MachineOperand &OpToFold = MI.getOperand(1);
-      bool FoldingImm =
-          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
-
-      // FIXME: We could also be folding things like TargetIndexes.
-      if (!FoldingImm && !OpToFold.isReg())
-        continue;
-
-      if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
-        continue;
+      // Saw an unknown clobber of m0, so we no longer know what it is.
+      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+        CurrentKnownM0Val = nullptr;
 
-      // Prevent folding operands backwards in the function. For example,
-      // the COPY opcode must not be replaced by 1 in this example:
-      //
-      // %3 = COPY %vgpr0; VGPR_32:%3
-      // ...
-      // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
-      if (!MI.getOperand(0).getReg().isVirtual())
-        continue;
-
-      Changed |= foldInstOperand(MI, OpToFold);
-
-      // If we managed to fold all uses of this copy then we might as well
-      // delete it now.
-      // The only reason we need to follow chains of copies here is that
-      // tryFoldRegSequence looks forward through copies before folding a
-      // REG_SEQUENCE into its eventual users.
-      auto *InstToErase = &MI;
-      while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
-        auto &SrcOp = InstToErase->getOperand(1);
-        auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
-        InstToErase->eraseFromParent();
-        Changed = true;
-        InstToErase = nullptr;
-        if (!SrcReg || SrcReg.isPhysical())
-          break;
-        InstToErase = MRI->getVRegDef(SrcReg);
-        if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
-          break;
-      }
-      if (InstToErase && InstToErase->isRegSequence() &&
-          MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
-        InstToErase->eraseFromParent();
-        Changed = true;
-      }
+      // TODO: Omod might be OK if there is NSZ only on the source
+      // instruction, and not the omod multiply.
+      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
+          !tryFoldOMod(MI))
+        Changed |= tryFoldClamp(MI);
     }
   }
+
   return Changed;
 }
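The cleanup loop that this hunk moves into tryFoldFoldableCopy has a simple shape: once every use of the copy's result has been folded away, the copy itself is dead; erasing it removes one use of its source, which may in turn make the next foldable copy up the chain dead. Below is a hedged sketch of the same walk over a toy def/use model — Inst, Ctx, and their members are made-up stand-ins for illustration, not MachineRegisterInfo API:

#include <unordered_map>

// Toy stand-ins for a def/use graph; not LLVM types.
struct Inst {
  int DefReg;        // register this instruction defines
  int SrcReg;        // register it copies from; -1 if the source is not a reg
  bool FoldableCopy; // whether the walk may continue through this def
};

struct Ctx {
  std::unordered_map<int, Inst *> DefOf; // reg -> defining instruction
  std::unordered_map<int, int> UseCount; // reg -> remaining uses

  // Erase one instruction: its def disappears and one use of its source dies.
  void erase(Inst *I) {
    DefOf.erase(I->DefReg);
    if (I->SrcReg >= 0)
      --UseCount[I->SrcReg];
  }

  // Mirrors the while-loop in tryFoldFoldableCopy: keep erasing up the copy
  // chain for as long as the current result has no remaining uses.
  void eraseDeadCopyChain(Inst *I) {
    while (I && UseCount[I->DefReg] == 0) {
      int Src = I->SrcReg;
      erase(I);
      if (Src < 0)
        break; // non-register source: nothing further up the chain
      auto It = DefOf.find(Src);
      I = (It != DefOf.end() && It->second->FoldableCopy) ? It->second
                                                          : nullptr;
    }
  }
};

As in the real code, the walk stops at a source that is not a trackable register or whose definition is not itself a foldable copy; the final REG_SEQUENCE special case in the pass is the one extra step the toy model omits.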