Skip to content

Commit a5cbd2a

Browse files
authored
Revert "[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#133242)" (#144039)
This reverts commit 130080f because it causes issues in testcases similar to coalescer_remat.ll [1], i.e. when we use a VGPR tuple but only write to its lower parts. The high VGPRs would then not be included in the vgpr_count, and accessing them would be an out-of-bounds violation. [1] https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
1 parent 9eef4d1 commit a5cbd2a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+473
-587
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4263,9 +4263,10 @@ same *vendor-name*.
42634263
wavefront for
42644264
GFX6-GFX9. A register
42654265
is required if it is
4266-
written to, or
4266+
used explicitly, or
42674267
if a higher numbered
4268-
register is written to. This
4268+
register is used
4269+
explicitly. This
42694270
includes the special
42704271
SGPRs for VCC, Flat
42714272
Scratch (GFX7-GFX9)
@@ -4283,10 +4284,10 @@ same *vendor-name*.
42834284
each work-item for
42844285
GFX6-GFX9. A register
42854286
is required if it is
4286-
written to, or
4287+
used explicitly, or
42874288
if a higher numbered
4288-
register is
4289-
written to.
4289+
register is used
4290+
explicitly.
42904291
".agpr_count" integer Required Number of accumulator
42914292
registers required by
42924293
each work-item for

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
989989
// dispatch registers are function args.
990990
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
991991

992-
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
992+
if (isShader(F.getCallingConv())) {
993993
bool IsPixelShader =
994994
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
995995

@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
10601060

10611061
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
10621062
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1063+
} else if (isKernel(F.getCallingConv()) &&
1064+
MFI->getNumKernargPreloadedSGPRs()) {
1065+
// Consider cases where the total number of UserSGPRs with trailing
1066+
// allocated preload SGPRs, is greater than the number of explicitly
1067+
// referenced SGPRs.
1068+
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1069+
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1070+
ProgInfo.NumSGPR =
1071+
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
10631072
}
10641073

10651074
// Adjust number of registers used to meet default/requested minimum/maximum

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 266 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
137137
if (MFI->isStackRealigned())
138138
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
139139

140-
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
141-
142-
Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
143-
Info.NumExplicitSGPR =
144-
TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
145-
if (ST.hasMAIInsts())
146-
Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
147-
148-
// Preloaded registers are written by the hardware, not defined in the
149-
// function body, so they need special handling.
150-
if (MFI->isEntryFunction()) {
151-
Info.NumExplicitSGPR =
152-
std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
153-
Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
154-
}
155-
156-
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
140+
Info.UsesVCC =
141+
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
142+
143+
// If there are no calls, MachineRegisterInfo can tell us the used register
144+
// count easily.
145+
// A tail call isn't considered a call for MachineFrameInfo's purposes.
146+
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
147+
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
148+
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
149+
if (ST.hasMAIInsts())
150+
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
157151
return Info;
152+
}
158153

154+
int32_t MaxVGPR = -1;
155+
int32_t MaxAGPR = -1;
156+
int32_t MaxSGPR = -1;
159157
Info.CalleeSegmentSize = 0;
160158

161159
for (const MachineBasicBlock &MBB : MF) {
162160
for (const MachineInstr &MI : MBB) {
161+
// TODO: Check regmasks? Do they occur anywhere except calls?
162+
for (const MachineOperand &MO : MI.operands()) {
163+
unsigned Width = 0;
164+
bool IsSGPR = false;
165+
bool IsAGPR = false;
166+
167+
if (!MO.isReg())
168+
continue;
169+
170+
Register Reg = MO.getReg();
171+
switch (Reg) {
172+
case AMDGPU::EXEC:
173+
case AMDGPU::EXEC_LO:
174+
case AMDGPU::EXEC_HI:
175+
case AMDGPU::SCC:
176+
case AMDGPU::M0:
177+
case AMDGPU::M0_LO16:
178+
case AMDGPU::M0_HI16:
179+
case AMDGPU::SRC_SHARED_BASE_LO:
180+
case AMDGPU::SRC_SHARED_BASE:
181+
case AMDGPU::SRC_SHARED_LIMIT_LO:
182+
case AMDGPU::SRC_SHARED_LIMIT:
183+
case AMDGPU::SRC_PRIVATE_BASE_LO:
184+
case AMDGPU::SRC_PRIVATE_BASE:
185+
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
186+
case AMDGPU::SRC_PRIVATE_LIMIT:
187+
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
188+
case AMDGPU::SGPR_NULL:
189+
case AMDGPU::SGPR_NULL64:
190+
case AMDGPU::MODE:
191+
continue;
192+
193+
case AMDGPU::NoRegister:
194+
assert(MI.isDebugInstr() &&
195+
"Instruction uses invalid noreg register");
196+
continue;
197+
198+
case AMDGPU::VCC:
199+
case AMDGPU::VCC_LO:
200+
case AMDGPU::VCC_HI:
201+
case AMDGPU::VCC_LO_LO16:
202+
case AMDGPU::VCC_LO_HI16:
203+
case AMDGPU::VCC_HI_LO16:
204+
case AMDGPU::VCC_HI_HI16:
205+
Info.UsesVCC = true;
206+
continue;
207+
208+
case AMDGPU::FLAT_SCR:
209+
case AMDGPU::FLAT_SCR_LO:
210+
case AMDGPU::FLAT_SCR_HI:
211+
continue;
212+
213+
case AMDGPU::XNACK_MASK:
214+
case AMDGPU::XNACK_MASK_LO:
215+
case AMDGPU::XNACK_MASK_HI:
216+
llvm_unreachable("xnack_mask registers should not be used");
217+
218+
case AMDGPU::LDS_DIRECT:
219+
llvm_unreachable("lds_direct register should not be used");
220+
221+
case AMDGPU::TBA:
222+
case AMDGPU::TBA_LO:
223+
case AMDGPU::TBA_HI:
224+
case AMDGPU::TMA:
225+
case AMDGPU::TMA_LO:
226+
case AMDGPU::TMA_HI:
227+
llvm_unreachable("trap handler registers should not be used");
228+
229+
case AMDGPU::SRC_VCCZ:
230+
llvm_unreachable("src_vccz register should not be used");
231+
232+
case AMDGPU::SRC_EXECZ:
233+
llvm_unreachable("src_execz register should not be used");
234+
235+
case AMDGPU::SRC_SCC:
236+
llvm_unreachable("src_scc register should not be used");
237+
238+
default:
239+
break;
240+
}
241+
242+
if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
243+
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
244+
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
245+
IsSGPR = true;
246+
Width = 1;
247+
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
248+
AMDGPU::VGPR_16RegClass.contains(Reg)) {
249+
IsSGPR = false;
250+
Width = 1;
251+
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
252+
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
253+
IsSGPR = false;
254+
IsAGPR = true;
255+
Width = 1;
256+
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
257+
IsSGPR = true;
258+
Width = 2;
259+
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
260+
IsSGPR = false;
261+
Width = 2;
262+
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
263+
IsSGPR = false;
264+
IsAGPR = true;
265+
Width = 2;
266+
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
267+
IsSGPR = false;
268+
Width = 3;
269+
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
270+
IsSGPR = true;
271+
Width = 3;
272+
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
273+
IsSGPR = false;
274+
IsAGPR = true;
275+
Width = 3;
276+
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
277+
IsSGPR = true;
278+
Width = 4;
279+
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
280+
IsSGPR = false;
281+
Width = 4;
282+
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
283+
IsSGPR = false;
284+
IsAGPR = true;
285+
Width = 4;
286+
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
287+
IsSGPR = false;
288+
Width = 5;
289+
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
290+
IsSGPR = true;
291+
Width = 5;
292+
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
293+
IsSGPR = false;
294+
IsAGPR = true;
295+
Width = 5;
296+
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
297+
IsSGPR = false;
298+
Width = 6;
299+
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
300+
IsSGPR = true;
301+
Width = 6;
302+
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
303+
IsSGPR = false;
304+
IsAGPR = true;
305+
Width = 6;
306+
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
307+
IsSGPR = false;
308+
Width = 7;
309+
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
310+
IsSGPR = true;
311+
Width = 7;
312+
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
313+
IsSGPR = false;
314+
IsAGPR = true;
315+
Width = 7;
316+
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
317+
IsSGPR = true;
318+
Width = 8;
319+
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
320+
IsSGPR = false;
321+
Width = 8;
322+
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
323+
IsSGPR = false;
324+
IsAGPR = true;
325+
Width = 8;
326+
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
327+
IsSGPR = false;
328+
Width = 9;
329+
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
330+
IsSGPR = true;
331+
Width = 9;
332+
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
333+
IsSGPR = false;
334+
IsAGPR = true;
335+
Width = 9;
336+
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
337+
IsSGPR = false;
338+
Width = 10;
339+
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
340+
IsSGPR = true;
341+
Width = 10;
342+
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
343+
IsSGPR = false;
344+
IsAGPR = true;
345+
Width = 10;
346+
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
347+
IsSGPR = false;
348+
Width = 11;
349+
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
350+
IsSGPR = true;
351+
Width = 11;
352+
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
353+
IsSGPR = false;
354+
IsAGPR = true;
355+
Width = 11;
356+
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
357+
IsSGPR = false;
358+
Width = 12;
359+
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
360+
IsSGPR = true;
361+
Width = 12;
362+
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
363+
IsSGPR = false;
364+
IsAGPR = true;
365+
Width = 12;
366+
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
367+
IsSGPR = true;
368+
Width = 16;
369+
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
370+
IsSGPR = false;
371+
Width = 16;
372+
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
373+
IsSGPR = false;
374+
IsAGPR = true;
375+
Width = 16;
376+
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
377+
IsSGPR = true;
378+
Width = 32;
379+
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
380+
IsSGPR = false;
381+
Width = 32;
382+
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
383+
IsSGPR = false;
384+
IsAGPR = true;
385+
Width = 32;
386+
} else {
387+
// We only expect TTMP registers or registers that do not belong to
388+
// any RC.
389+
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
390+
AMDGPU::TTMP_64RegClass.contains(Reg) ||
391+
AMDGPU::TTMP_128RegClass.contains(Reg) ||
392+
AMDGPU::TTMP_256RegClass.contains(Reg) ||
393+
AMDGPU::TTMP_512RegClass.contains(Reg) ||
394+
!TRI.getPhysRegBaseClass(Reg)) &&
395+
"Unknown register class");
396+
}
397+
unsigned HWReg = TRI.getHWRegIndex(Reg);
398+
int MaxUsed = HWReg + Width - 1;
399+
if (IsSGPR) {
400+
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
401+
} else if (IsAGPR) {
402+
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
403+
} else {
404+
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
405+
}
406+
}
407+
163408
if (MI.isCall()) {
164409
// Pseudo used just to encode the underlying global. Is there a better
165410
// way to track this?
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
219464
}
220465
}
221466

467+
Info.NumExplicitSGPR = MaxSGPR + 1;
468+
Info.NumVGPR = MaxVGPR + 1;
469+
Info.NumAGPR = MaxAGPR + 1;
470+
222471
return Info;
223472
}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -970,25 +970,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
970970
return NumUserSGPRs;
971971
}
972972

973-
// Get the number of preloaded SGPRs for compute kernels.
974973
unsigned getNumPreloadedSGPRs() const {
975974
return NumUserSGPRs + NumSystemSGPRs;
976975
}
977976

978-
// Get the number of preloaded VGPRs for compute kernels.
979-
unsigned getNumPreloadedVGPRs() const {
980-
if (hasWorkItemIDZ())
981-
return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
982-
983-
if (hasWorkItemIDY())
984-
return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
985-
986-
if (hasWorkItemIDX())
987-
return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
988-
989-
return 0;
990-
}
991-
992977
unsigned getNumKernargPreloadedSGPRs() const {
993978
return UserSGPRInfo.getNumKernargPreloadSGPRs();
994979
}

0 commit comments

Comments (0)