Skip to content

Commit 10cef70

Browse files
committed
AMDGPU: Clean up LDS-related occupancy calculations
Occupancy is expressed as waves per SIMD. This means that we need to take into account the number of SIMDs per "CU" or, to be more precise, the number of SIMDs over which a workgroup may be distributed.

getOccupancyWithLocalMemSize was wrong because it didn't take SIMDs into account at all.

At the same time, we need to take into account that WGP mode offers access to a larger total amount of LDS, since this can affect how non-power-of-two LDS allocations are rounded. To make this work consistently, we distinguish between (available) local memory size and addressable local memory size (which is always limited to 64 kB on gfx10+, even with WGP mode).

This change results in a massive amount of test churn. A lot of it is caused by the fact that the default work group size is 1024, which means that (due to rounding effects) the default occupancy on older hardware is 8 instead of 10, which affects scheduling via register pressure estimates. I've adjusted most tests by just running the UTC tools, but in some cases I manually changed the work group size to 32 or 64 to make sure that work group size chunkiness has no effect.

Differential Revision: https://reviews.llvm.org/D139468
1 parent 0775f21 commit 10cef70

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+7881
-7798
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -842,11 +842,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
842842
Ctx.diagnose(Diag);
843843
}
844844

845-
if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
845+
if (MFI->getLDSSize() >
846+
static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
846847
LLVMContext &Ctx = MF.getFunction().getContext();
847-
DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
848-
MFI->getLDSSize(),
849-
STM.getLocalMemorySize(), DS_Error);
848+
DiagnosticInfoResourceLimit Diag(
849+
MF.getFunction(), "local memory", MFI->getLDSSize(),
850+
STM.getAddressableLocalMemorySize(), DS_Error);
850851
Ctx.diagnose(Diag);
851852
}
852853

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
798798
}
799799
}
800800

801-
LocalMemLimit = ST.getLocalMemorySize();
801+
LocalMemLimit = ST.getAddressableLocalMemorySize();
802802
if (LocalMemLimit == 0)
803803
return false;
804804

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
141141
HasMovrel = true;
142142
}
143143

144+
AddressableLocalMemorySize = LocalMemorySize;
145+
146+
if (AMDGPU::isGFX10Plus(*this) &&
147+
!getFeatureBits().test(AMDGPU::FeatureCuMode))
148+
LocalMemorySize *= 2;
149+
144150
// Don't crash on invalid devices.
145151
if (WavefrontSizeLog2 == 0)
146152
WavefrontSizeLog2 = 5;
@@ -304,19 +310,29 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
304310
}
305311
}
306312

307-
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
308-
const Function &F) const {
309-
if (NWaves == 1)
310-
return getLocalMemorySize();
311-
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
312-
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
313-
if (!WorkGroupsPerCu)
314-
return 0;
315-
unsigned MaxWaves = getMaxWavesPerEU();
316-
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
313+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
314+
// allows the given function to achieve an occupancy of NWaves waves per
315+
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
316+
unsigned
317+
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
318+
const Function &F) const {
319+
const unsigned WaveSize = getWavefrontSize();
320+
const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
321+
const unsigned WavesPerWorkgroup =
322+
std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
323+
324+
const unsigned WorkGroupsPerCU =
325+
std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
326+
327+
return getLocalMemorySize() / WorkGroupsPerCU;
317328
}
318329

319330
// FIXME: Should return min,max range.
331+
//
332+
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
333+
// be achieved when only the given function is running on the machine; and
334+
// taking into account the overall number of wave slots, the (maximum) workgroup
335+
// size, and the per-workgroup LDS allocation size.
320336
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
321337
const Function &F) const {
322338
const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
@@ -338,10 +354,13 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
338354

339355
NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
340356

341-
// Round to the number of waves.
342-
const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
357+
// Round to the number of waves per CU.
358+
const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
343359
unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
344360

361+
// Number of waves per EU (SIMD).
362+
MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
363+
345364
// Clamp to the maximum possible number of waves.
346365
MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
347366

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class AMDGPUSubtarget {
6464
unsigned EUsPerCU = 4;
6565
unsigned MaxWavesPerEU = 10;
6666
unsigned LocalMemorySize = 0;
67+
unsigned AddressableLocalMemorySize = 0;
6768
char WavefrontSizeLog2 = 0;
6869

6970
public:
@@ -210,6 +211,10 @@ class AMDGPUSubtarget {
210211
return LocalMemorySize;
211212
}
212213

214+
unsigned getAddressableLocalMemorySize() const {
215+
return AddressableLocalMemorySize;
216+
}
217+
213218
/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
214219
/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
215220
/// CU mode into account.

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -903,10 +903,12 @@ void GCNSchedStage::checkScheduling() {
903903
return;
904904
}
905905

906+
unsigned TargetOccupancy =
907+
std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
906908
unsigned WavesAfter =
907-
std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
909+
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
908910
unsigned WavesBefore =
909-
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
911+
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
910912
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
911913
<< ", after " << WavesAfter << ".\n");
912914

llvm/lib/Target/AMDGPU/R600Subtarget.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
2828
InstrInfo(*this),
2929
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
3030
TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
31-
InstrItins(getInstrItineraryForCPU(GPU)) {}
31+
InstrItins(getInstrItineraryForCPU(GPU)) {
32+
AddressableLocalMemorySize = LocalMemorySize;
33+
}
3234

3335
R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
3436
StringRef GPU,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12635,7 +12635,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
1263512635
// We can report everything over the maximum size as 0. We can't report
1263612636
// based on the actual size because we don't know if it's accurate or not
1263712637
// at any given point.
12638-
Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
12638+
Known.Zero.setHighBits(
12639+
countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));
1263912640
break;
1264012641
}
1264112642
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -828,11 +828,26 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
828828
}
829829

830830
unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
831+
unsigned BytesPerCU = 0;
832+
if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
833+
BytesPerCU = 32768;
834+
if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
835+
BytesPerCU = 65536;
836+
837+
// "Per CU" really means "per whatever functional block the waves of a
838+
// workgroup must share". So the effective local memory size is doubled in
839+
// WGP mode on gfx10.
840+
if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
841+
BytesPerCU *= 2;
842+
843+
return BytesPerCU;
844+
}
845+
846+
unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
831847
if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
832848
return 32768;
833849
if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
834850
return 65536;
835-
836851
return 0;
837852
}
838853

@@ -852,11 +867,18 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
852867
assert(FlatWorkGroupSize != 0);
853868
if (STI->getTargetTriple().getArch() != Triple::amdgcn)
854869
return 8;
870+
unsigned MaxWaves = getMaxWavesPerEU(STI) * getEUsPerCU(STI);
855871
unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
856-
if (N == 1)
857-
return 40;
858-
N = 40 / N;
859-
return std::min(N, 16u);
872+
if (N == 1) {
873+
// Single-wave workgroups don't consume barrier resources.
874+
return MaxWaves;
875+
}
876+
877+
unsigned MaxBarriers = 16;
878+
if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
879+
MaxBarriers = 32;
880+
881+
return std::min(MaxWaves / N, MaxBarriers);
860882
}
861883

862884
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,10 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI);
192192
/// \returns Local memory size in bytes for given subtarget \p STI.
193193
unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
194194

195+
/// \returns Maximum addressable local memory size in bytes for given subtarget
196+
/// \p STI.
197+
unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
198+
195199
/// \returns Number of execution units per compute unit for given subtarget \p
196200
/// STI.
197201
unsigned getEUsPerCU(const MCSubtargetInfo *STI);

0 commit comments

Comments
 (0)