@@ -1847,6 +1847,110 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1847
1847
}
1848
1848
}
1849
1849
1850
+ static void assignSlotsUsingVGPRBlocks (MachineFunction &MF,
1851
+ const GCNSubtarget &ST,
1852
+ const TargetRegisterInfo *TRI,
1853
+ std::vector<CalleeSavedInfo> &CSI,
1854
+ unsigned &MinCSFrameIndex,
1855
+ unsigned &MaxCSFrameIndex) {
1856
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo <SIMachineFunctionInfo>();
1857
+ MachineFrameInfo &MFI = MF.getFrameInfo ();
1858
+ const SIInstrInfo *TII = ST.getInstrInfo ();
1859
+ const SIRegisterInfo *MRI = ST.getRegisterInfo ();
1860
+
1861
+ assert (std::is_sorted (CSI.begin (), CSI.end (),
1862
+ [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1863
+ return A.getReg () < B.getReg ();
1864
+ }) &&
1865
+ " Callee saved registers not sorted" );
1866
+
1867
+ auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1868
+ return !CSI.isSpilledToReg () &&
1869
+ MRI->isVGPR (MF.getRegInfo (), CSI.getReg ()) &&
1870
+ !FuncInfo->isWWMReservedRegister (CSI.getReg ());
1871
+ };
1872
+
1873
+ auto CSEnd = CSI.end ();
1874
+ for (auto CSIt = CSI.begin (); CSIt != CSEnd; ++CSIt) {
1875
+ Register Reg = CSIt->getReg ();
1876
+ if (!CanUseBlockOps (*CSIt))
1877
+ continue ;
1878
+
1879
+ // Find all the regs that will fit in a 32-bit block starting at the current
1880
+ // reg and build the mask. It should have 1 for every register that's
1881
+ // included, with the current register as the least significant bit.
1882
+ uint32_t Mask = 1 ;
1883
+ CSEnd = std::remove_if (
1884
+ CSIt + 1 , CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
1885
+ if (CanUseBlockOps (CSI) && CSI.getReg () < Reg + 32 ) {
1886
+ Mask |= 1 << (CSI.getReg () - Reg);
1887
+ return true ;
1888
+ } else {
1889
+ return false ;
1890
+ }
1891
+ });
1892
+
1893
+ const TargetRegisterClass *BlockRegClass =
1894
+ TII->getRegClassForBlockOp (TRI, MF);
1895
+ Register RegBlock =
1896
+ MRI->getMatchingSuperReg (Reg, AMDGPU::sub0, BlockRegClass);
1897
+ if (!RegBlock) {
1898
+ // We couldn't find a super register for the block. This can happen if
1899
+ // the register we started with is too high (e.g. v232 if the maximum is
1900
+ // v255). We therefore try to get the last register block and figure out
1901
+ // the mask from there.
1902
+ Register LastBlockStart =
1903
+ AMDGPU::VGPR0 + alignDown (Reg - AMDGPU::VGPR0, 32 );
1904
+ RegBlock =
1905
+ MRI->getMatchingSuperReg (LastBlockStart, AMDGPU::sub0, BlockRegClass);
1906
+ assert (RegBlock && MRI->isSubRegister (RegBlock, Reg) &&
1907
+ " Couldn't find super register" );
1908
+ int RegDelta = Reg - LastBlockStart;
1909
+ assert (RegDelta > 0 && llvm::countl_zero (Mask) >= RegDelta &&
1910
+ " Bad shift amount" );
1911
+ Mask <<= RegDelta;
1912
+ }
1913
+
1914
+ FuncInfo->setMaskForVGPRBlockOps (RegBlock, Mask);
1915
+
1916
+ // The stack objects can be a bit smaller than the register block if we know
1917
+ // some of the high bits of Mask are 0. This may happen often with calling
1918
+ // conventions where the caller and callee-saved VGPRs are interleaved at
1919
+ // a small boundary (e.g. 8 or 16).
1920
+ int UnusedBits = llvm::countl_zero (Mask);
1921
+ unsigned BlockSize = MRI->getSpillSize (*BlockRegClass) - UnusedBits * 4 ;
1922
+ int FrameIdx =
1923
+ MFI.CreateStackObject (BlockSize, MRI->getSpillAlign (*BlockRegClass),
1924
+ /* isSpillSlot=*/ true );
1925
+ if ((unsigned )FrameIdx < MinCSFrameIndex)
1926
+ MinCSFrameIndex = FrameIdx;
1927
+ if ((unsigned )FrameIdx > MaxCSFrameIndex)
1928
+ MaxCSFrameIndex = FrameIdx;
1929
+
1930
+ CSIt->setFrameIdx (FrameIdx);
1931
+ CSIt->setReg (RegBlock);
1932
+ CSIt->setHandledByTarget ();
1933
+ }
1934
+ CSI.erase (CSEnd, CSI.end ());
1935
+ }
1936
+
1937
+ bool SIFrameLowering::assignCalleeSavedSpillSlots (
1938
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
1939
+ std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1940
+ unsigned &MaxCSFrameIndex) const {
1941
+ if (CSI.empty ())
1942
+ return true ; // Early exit if no callee saved registers are modified!
1943
+
1944
+ const GCNSubtarget &ST = MF.getSubtarget <GCNSubtarget>();
1945
+ bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR ();
1946
+
1947
+ if (UseVGPRBlocks)
1948
+ assignSlotsUsingVGPRBlocks (MF, ST, TRI, CSI, MinCSFrameIndex,
1949
+ MaxCSFrameIndex);
1950
+
1951
+ return assignCalleeSavedSpillSlots (MF, TRI, CSI);
1952
+ }
1953
+
1850
1954
bool SIFrameLowering::assignCalleeSavedSpillSlots (
1851
1955
MachineFunction &MF, const TargetRegisterInfo *TRI,
1852
1956
std::vector<CalleeSavedInfo> &CSI) const {
@@ -1915,6 +2019,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1915
2019
return true ;
1916
2020
}
1917
2021
2022
+ bool SIFrameLowering::spillCalleeSavedRegisters (
2023
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2024
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2025
+ MachineFunction *MF = MBB.getParent ();
2026
+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
2027
+ if (!ST.useVGPRBlockOpsForCSR ())
2028
+ return false ;
2029
+
2030
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo ();
2031
+ SIMachineFunctionInfo *MFI = MF->getInfo <SIMachineFunctionInfo>();
2032
+ const SIInstrInfo *TII = ST.getInstrInfo ();
2033
+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
2034
+
2035
+ for (const CalleeSavedInfo &CS : CSI) {
2036
+ Register Reg = CS.getReg ();
2037
+ if (!CS.isHandledByTarget ())
2038
+ continue ;
2039
+
2040
+ // Build a scratch block store.
2041
+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
2042
+ int FrameIndex = CS.getFrameIdx ();
2043
+ MachinePointerInfo PtrInfo =
2044
+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
2045
+ MachineMemOperand *MMO =
2046
+ MF->getMachineMemOperand (PtrInfo, MachineMemOperand::MOStore,
2047
+ FrameInfo.getObjectSize (FrameIndex),
2048
+ FrameInfo.getObjectAlign (FrameIndex));
2049
+
2050
+ BuildMI (MBB, MI, MI->getDebugLoc (),
2051
+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
2052
+ .addReg (Reg, getKillRegState (false ))
2053
+ .addFrameIndex (FrameIndex)
2054
+ .addReg (MFI->getStackPtrOffsetReg ())
2055
+ .addImm (0 )
2056
+ .addImm (Mask)
2057
+ .addMemOperand (MMO);
2058
+
2059
+ FuncInfo->setHasSpilledVGPRs ();
2060
+
2061
+ // Add the register to the liveins. This is necessary because if any of the
2062
+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2063
+ // then the whole block will be marked as reserved and `updateLiveness` will
2064
+ // skip it.
2065
+ MBB.addLiveIn (Reg);
2066
+ }
2067
+
2068
+ return false ;
2069
+ }
2070
+
2071
+ bool SIFrameLowering::restoreCalleeSavedRegisters (
2072
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2073
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2074
+ MachineFunction *MF = MBB.getParent ();
2075
+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
2076
+ if (!ST.useVGPRBlockOpsForCSR ())
2077
+ return false ;
2078
+
2079
+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
2080
+ MachineFrameInfo &MFI = MF->getFrameInfo ();
2081
+ const SIInstrInfo *TII = ST.getInstrInfo ();
2082
+ const SIRegisterInfo *SITRI = static_cast <const SIRegisterInfo *>(TRI);
2083
+ for (const CalleeSavedInfo &CS : reverse (CSI)) {
2084
+ if (!CS.isHandledByTarget ())
2085
+ continue ;
2086
+
2087
+ // Build a scratch block load.
2088
+ Register Reg = CS.getReg ();
2089
+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
2090
+ int FrameIndex = CS.getFrameIdx ();
2091
+ MachinePointerInfo PtrInfo =
2092
+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
2093
+ MachineMemOperand *MMO = MF->getMachineMemOperand (
2094
+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize (FrameIndex),
2095
+ MFI.getObjectAlign (FrameIndex));
2096
+
2097
+ auto MIB = BuildMI (MBB, MI, MI->getDebugLoc (),
2098
+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
2099
+ .addFrameIndex (FrameIndex)
2100
+ .addReg (FuncInfo->getStackPtrOffsetReg ())
2101
+ .addImm (0 )
2102
+ .addImm (Mask)
2103
+ .addMemOperand (MMO);
2104
+ SITRI->addImplicitUsesForBlockCSRLoad (MIB, Reg);
2105
+
2106
+ // Add the register to the liveins. This is necessary because if any of the
2107
+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2108
+ // then the whole block will be marked as reserved and `updateLiveness` will
2109
+ // skip it.
2110
+ if (!MBB.isLiveIn (Reg))
2111
+ MBB.addLiveIn (Reg);
2112
+ }
2113
+
2114
+ return false ;
2115
+ }
2116
+
1918
2117
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr (
1919
2118
MachineFunction &MF,
1920
2119
MachineBasicBlock &MBB,
0 commit comments