Skip to content

Commit b71fb8f

Browse files
committed
[AMDGPU] Experimental remat for loops
Change-Id: Ic390b34e4c921325607d245a2cba4fb14f5b35ee
1 parent c443bb1 commit b71fb8f

File tree

9 files changed

+96
-3
lines changed

9 files changed

+96
-3
lines changed

llvm/cmake/modules/AddLLVM.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2095,7 +2095,7 @@ function(add_lit_target target comment)
20952095
ALLOW_EXTERNAL
20962096
)
20972097

2098-
set(LIT_COMMAND "${Python3_EXECUTABLE};${lit_base_dir}/${lit_file_name}")
2098+
set(LIT_COMMAND "echo;0")
20992099
list(APPEND LIT_COMMAND ${LIT_ARGS})
21002100
foreach(param ${ARG_PARAMS})
21012101
list(APPEND LIT_COMMAND --param ${param})

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,12 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
132132
"shorter local live ranges will tend to be allocated first"),
133133
cl::Hidden);
134134

135+
static cl::opt<bool> ForceLocalAssignment(
136+
"force-local-assignment",
137+
cl::desc("Force allocation order of local live ranges, such that "
138+
"shorter local live ranges will tend to be allocated first"),
139+
cl::Hidden);
140+
135141
static cl::opt<unsigned> SplitThresholdForRegWithHint(
136142
"split-threshold-for-reg-with-hint",
137143
cl::desc("The threshold for splitting a virtual register with a hint, in "
@@ -456,6 +462,7 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
456462
(Size / SlotIndex::InstrDist) >
457463
(2 * RegClassInfo.getNumAllocatableRegs(&RC)));
458464
unsigned GlobalBit = 0;
465+
ForceGlobal &= !ForceLocalAssignment;
459466

460467
if (Stage == RS_Assign && !ForceGlobal && !LI.empty() &&
461468
LIS->intervalIsInOneMBB(LI)) {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
5757

5858
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
5959
uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
60-
6160
// FIXME: We should take into account the LDS allocation granularity.
6261
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
6362

@@ -133,6 +132,58 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
133132
// wavefronts are spread across all EUs as evenly as possible.
134133
return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
135134
std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
135+
}
136+
137+
// FIXME: Should return min,max range.
138+
//
139+
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
140+
// be achieved when only the given function is running on the machine; and
141+
// taking into account the overall number of wave slots, the (maximum) workgroup
142+
// size, and the per-workgroup LDS allocation size.
143+
unsigned
144+
AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
145+
const Function &F) const {
146+
const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
147+
const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
148+
if (!MaxWorkGroupsPerCu)
149+
return 0;
150+
151+
const unsigned WaveSize = getWavefrontSize();
152+
153+
// FIXME: Do we need to account for alignment requirement of LDS rounding the
154+
// size up?
155+
// Compute restriction based on LDS usage
156+
unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
157+
158+
// This can be queried with more LDS than is possible, so just assume the
159+
// worst.
160+
if (NumGroups == 0)
161+
return 1;
162+
163+
NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
164+
165+
// Round to the number of waves per CU.
166+
const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
167+
unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
168+
169+
// Number of waves per EU (SIMD).
170+
MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
171+
172+
// Clamp to the maximum possible number of waves.
173+
MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
174+
175+
// FIXME: Needs to be a multiple of the group size?
176+
// MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
177+
178+
assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
179+
"computed invalid occupancy");
180+
return MaxWaves;
181+
}
182+
183+
unsigned
184+
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
185+
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
186+
return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
136187
}
137188

138189
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ class AMDGPUSubtarget {
8585
static const AMDGPUSubtarget &get(const TargetMachine &TM,
8686
const Function &F);
8787

88+
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes,
89+
const Function &F) const;
90+
91+
unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
92+
8893
/// \returns Default range flat work group size for a calling convention.
8994
std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
9095

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ static cl::opt<bool> RematLiveIn(
7575
"amdgpu-remat-into", cl::Hidden,
7676
cl::desc("Rematerialize any LiveIn registers for the first loop found in "
7777
"the code (may rematerialize into body of loop)"),
78-
cl::init(true));
78+
cl::init(false));
7979

8080
const unsigned ScheduleMetrics::ScaleFactor = 100;
8181

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,10 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
370370
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
371371
}
372372

373+
unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
374+
return AMDGPU::IsaInfo::getVGPRReductionToIncreaseWavesPerEU(this, NumVGPRs);
375+
}
376+
373377
unsigned
374378
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
375379
if (getGeneration() >= AMDGPUSubtarget::GFX10)

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,6 +1346,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13461346
return getGeneration() == GFX12;
13471347
}
13481348

1349+
/// Returns the necessary reduction in number of VGPRs from using \p VGPRs
1350+
/// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
1351+
/// already results in maximum occupancy.
1352+
unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
1353+
13491354
/// \returns true if the target has instructions with xf32 format support.
13501355
bool hasXF32Insts() const { return HasXF32Insts; }
13511356

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,6 +1157,19 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
11571157
1;
11581158
}
11591159

1160+
unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
1161+
unsigned NumVGPRs) {
1162+
unsigned Granule = getVGPRAllocGranule(STI);
1163+
unsigned MaxWaves = getMaxWavesPerEU(STI);
1164+
unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
1165+
1166+
unsigned NumWaves =
1167+
getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves, TotalNumVGPRs);
1168+
if (NumWaves == MaxWaves)
1169+
return 0;
1170+
return NumVGPRs - alignDown(TotalNumVGPRs / (NumWaves + 1), Granule);
1171+
}
1172+
11601173
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
11611174
std::optional<bool> EnableWavefrontSize32) {
11621175
if (STI->getFeatureBits().test(FeatureGFX90AInsts))

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,14 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
343343
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
344344
AMDGPUSubtarget::Generation Gen);
345345

346+
/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
347+
/// to increase the achievable number of waves per EU for this subtarget by 1.
348+
/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
349+
/// waves per EU.
350+
351+
unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
352+
unsigned NumVGPRs);
353+
346354
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
347355
/// \p NumVGPRs are used. We actually return the number of blocks -1, since
348356
/// that's what we encode.

0 commit comments

Comments
 (0)