
Commit 130080f

rovka and tsymalla authored
[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#133242)
Don't count register uses when determining the maximum number of registers used by a function. Count only the defs. This is really an underestimate of the true register usage, but in practice that's not a problem, because if a function uses a register, then it has either defined it earlier, or some other function that executed before has defined it.

In particular, the register counts are used:

1. When launching an entry function - in which case we're safe because the register counts of the entry function will include the register counts of all callees.
2. At function boundaries in dynamic VGPR mode. In this case it's safe because whenever we set the new VGPR allocation we take into account the outgoing_vgpr_count set by the middle-end.

The main advantage of doing this is that the artificial VGPR arguments used only for preserving the inactive lanes when using the llvm.amdgcn.init.whole.wave intrinsic are no longer counted. This enables us to allocate only the registers we need in dynamic VGPR mode.

---------

Co-authored-by: Thomas Symalla <[email protected]>
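For intuition, here is a rough sketch of def-only counting over a 32-bit register class. This is not the code added by the commit (the change goes through a SIRegisterInfo helper, getNumDefinedPhysRegs, visible in the AMDGPUResourceUsageAnalysis.cpp diff below); the helper name countDefinedRegs and the iteration strategy are assumptions, relying only on MachineRegisterInfo::isPhysRegModified to report whether a physical register is ever written in the function.

    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"

    // Hypothetical sketch, not the actual pass code: return one plus the index
    // of the highest register in RC that the function writes. Reads are ignored
    // on purpose; a register that is only read must have been written by a
    // caller or by an earlier function, which already counts it.
    static unsigned countDefinedRegs(const llvm::MachineRegisterInfo &MRI,
                                     const llvm::TargetRegisterClass &RC) {
      // Assumes RC lists its registers in hardware order (as VGPR_32 and
      // SGPR_32 do); walk from the top down and stop at the first def.
      for (unsigned I = RC.getNumRegs(); I > 0; --I)
        if (MRI.isPhysRegModified(RC.getRegister(I - 1)))
          return I;
      return 0; // The function never writes a register of this class.
    }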
1 parent e1276ec commit 130080f


48 files changed (+587, -473 lines)

llvm/docs/AMDGPUUsage.rst

Lines changed: 5 additions & 6 deletions
@@ -4263,10 +4263,9 @@ same *vendor-name*.
     wavefront for
     GFX6-GFX9. A register
     is required if it is
-    used explicitly, or
+    written to, or
     if a higher numbered
-    register is used
-    explicitly. This
+    register is written to. This
     includes the special
     SGPRs for VCC, Flat
     Scratch (GFX7-GFX9)
@@ -4284,10 +4283,10 @@ same *vendor-name*.
     each work-item for
     GFX6-GFX9. A register
     is required if it is
-    used explicitly, or
+    written to, or
    if a higher numbered
-    register is used
-    explicitly.
+    register is
+    written to.
     ".agpr_count" integer Required Number of accumulator
     registers required by
     each work-item for

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 1 addition & 10 deletions
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv())) {
+  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
@@ -1060,15 +1060,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
-  } else if (isKernel(F.getCallingConv()) &&
-             MFI->getNumKernargPreloadedSGPRs()) {
-    // Consider cases where the total number of UserSGPRs with trailing
-    // allocated preload SGPRs, is greater than the number of explicitly
-    // referenced SGPRs.
-    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
-        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
-    ProgInfo.NumSGPR =
-        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 17 additions & 266 deletions
@@ -137,274 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
-  Info.UsesVCC =
-      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
-  // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-    if (ST.hasMAIInsts())
-      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-    return Info;
+  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
+
+  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+  Info.NumExplicitSGPR =
+      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+
+  // Preloaded registers are written by the hardware, not defined in the
+  // function body, so they need special handling.
+  if (MFI->isEntryFunction()) {
+    Info.NumExplicitSGPR =
+        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
+    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
   }
 
-  int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
+    return Info;
+
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE_LO:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT_LO:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE_LO:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::SGPR_NULL64:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr() &&
-                 "Instruction uses invalid noreg register");
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
-            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 9;
-        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 9;
-        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 9;
-        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 10;
-        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 10;
-        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 10;
-        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 11;
-        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 11;
-        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 11;
-        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 12;
-        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 12;
-        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 12;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          // We only expect TTMP registers or registers that do not belong to
-          // any RC.
-          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
-                  !TRI.getPhysRegBaseClass(Reg)) &&
-                 "Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
-      }
-
       if (MI.isCall()) {
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
@@ -464,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
-
   return Info;
 }
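
One consequence of counting only defs, handled in the hunk above, is that hardware-preloaded registers (work-item IDs, workgroup IDs, the kernarg segment pointer, and so on) are live on entry to a kernel without any def in the body, so a pure def scan would undercount them; hence the new std::max clamp against getNumPreloadedSGPRs/getNumPreloadedVGPRs for entry functions. A minimal, hypothetical illustration of that clamp (the function and its names are mine, not the pass's):

    #include <algorithm>

    // Toy illustration: entry functions must report at least the preloaded
    // registers, because the hardware writes them before the first instruction
    // runs. Non-entry functions inherit that state from their caller, whose
    // own count already covers it.
    static unsigned clampedRegCount(unsigned NumDefined, unsigned NumPreloaded,
                                    bool IsEntryFunction) {
      return IsEntryFunction ? std::max(NumDefined, NumPreloaded) : NumDefined;
    }

For example, a kernel that only ever writes v0 and v1 but receives work-item IDs in v0..v2 (a common layout, used here purely for illustration) still reports 3 VGPRs.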

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 15 additions & 0 deletions
@@ -970,10 +970,25 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return NumUserSGPRs;
   }
 
+  // Get the number of preloaded SGPRs for compute kernels.
   unsigned getNumPreloadedSGPRs() const {
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  // Get the number of preloaded VGPRs for compute kernels.
+  unsigned getNumPreloadedVGPRs() const {
+    if (hasWorkItemIDZ())
+      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDY())
+      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDX())
+      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
+
+    return 0;
+  }
+
   unsigned getNumKernargPreloadedSGPRs() const {
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }
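
A quick worked example of the new getNumPreloadedVGPRs helper above, assuming the common entry-function layout in which the work-item IDs occupy VGPR0, VGPR1 and VGPR2 (the concrete register assignments are illustrative, not taken from this commit):

    // hasWorkItemIDZ(): WorkItemIDZ in VGPR2 -> (VGPR2 - VGPR0) + 1 = 3
    // only X and Y:     WorkItemIDY in VGPR1 -> (VGPR1 - VGPR0) + 1 = 2
    // only X:           WorkItemIDX in VGPR0 -> (VGPR0 - VGPR0) + 1 = 1
    // no work-item IDs: returns 0

In other words, the helper returns one plus the register index of the highest-numbered work-item ID that is enabled, checking Z, then Y, then X.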
