Skip to content

Commit f8be115

Browse files
kychendevigcbot
authored and committed
Improve RA optimization for scalar payload in send
For some send instructions, a scalar payload such as the address operand in a block load is defined as GRF-aligned in vISA. This adds RA constraints, which may result in spilling. This patch improves the RA optimization by relaxing the alignment restriction on the original variable and replacing local references with a temporary aligned variable to mitigate the problem.
1 parent 8771de8 commit f8be115

File tree

4 files changed

+67
-18
lines changed

4 files changed

+67
-18
lines changed

visa/GraphColor.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10089,6 +10089,12 @@ int GlobalRA::coloringRegAlloc() {
1008910089

1009010090
// Re-run GRA loop if changes were made to IR
1009110091
rerunGRA |= split.getChangesMade();
10092+
kernel.dumpToFile("after.Split_Aligned_Scalar." + std::to_string(iterationNo));
10093+
#ifndef DLL_MODE
10094+
if (stopAfter("Split_Aligned_Scalar")) {
10095+
return VISA_EARLY_EXIT;
10096+
}
10097+
#endif // DLL_MODE
1009210098
}
1009310099

1009410100
// Calculate the spill caused by send to decide if global splitting is

visa/SplitAlignedScalars.cpp

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
7575
Data.allowed = false;
7676
}
7777

78-
if (dst->getTypeSize() != dstTopDcl->getByteSize()) {
79-
// we require that dst opnd writes complete topdcl
78+
if ((dst->getTypeSize() != dstTopDcl->getByteSize()) &&
79+
!(dst->getTypeSize() == 4 && dstTopDcl->getByteSize() == 8)) {
80+
// Disallow case where dst opnd does not write complete topdcl,
81+
// except QW opnd which may be A64 address computed separately
82+
// with low/hi DW opnds
8083
Data.allowed = false;
8184
}
8285

@@ -89,10 +92,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
8992
auto dstDcl =
9093
dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
9194
if (dstDcl->getAliasDeclare()) {
92-
// disallow case where topdcl is scalar, but alias dcl
93-
// is not a scalar as it may be smaller in size. for eg,
94-
// topdcl may be :uq and alias may be of type :ud.
95-
if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
95+
// Disallow case where dst alias dcl size is different from
96+
// top dcl, except QW opnd which may be A64 address computed
97+
// separately with low/hi DW opnds
98+
if ((dstDcl->getByteSize() != dstTopDcl->getByteSize()) &&
99+
!(dstDcl->getByteSize() == 4 && dstTopDcl->getByteSize() == 8))
96100
Data.allowed = false;
97101
}
98102

@@ -102,6 +106,9 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
102106
}
103107

104108
if (!inst->isSend()) {
109+
if (inst->getExecSize() != 1) {
110+
Data.allowed = false;
111+
}
105112
// check whether dst type size != src type size
106113
// disallow optimization if dst type is different
107114
// than src as that entails alignment requirements
@@ -128,6 +135,10 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
128135
Data.lastUse = lexId;
129136

130137
if (!inst->isSend()) {
138+
if (!(src->asSrcRegRegion()->isScalar())) {
139+
Data.allowed = false;
140+
}
141+
131142
// mixed types have alignment requirements
132143
if (dst->getTypeSize() != src->getTypeSize())
133144
Data.allowed = false;
@@ -212,9 +223,8 @@ void SplitAlignedScalars::pruneCandidates(
212223
std::for_each(candidates.begin(), candidates.end(),
213224
[&](G4_Declare *dcl) { candidateList.push_back(dcl); });
214225

215-
// First re-order candidates based on spill cost in descending order
226+
// First re-order candidates based on spill cost in ascending order
216227
candidateList.sort(compareSpillCost);
217-
std::reverse(candidateList.begin(), candidateList.end());
218228

219229
for (auto it = candidateList.begin(); it != candidateList.end();) {
220230
auto dcl = *it;
@@ -248,7 +258,9 @@ void SplitAlignedScalars::pruneCandidates(
248258
candidates.clear();
249259
unsigned int totalMovsNeeded = 0;
250260
unsigned int estimatedInstCount = estimateInstCount();
251-
for (auto candidate : candidateList) {
261+
262+
for (auto ci = candidateList.rbegin(); ci != candidateList.rend(); ++ci) {
263+
const auto &candidate = *ci;
252264
bool isCandidate = false;
253265
auto numMovsNeeded = computeNumMovs(candidate);
254266

@@ -274,7 +286,9 @@ void SplitAlignedScalars::pruneCandidates(
274286
}
275287

276288
bool SplitAlignedScalars::isDclCandidate(G4_Declare *dcl) {
277-
if (dcl->getRegFile() == G4_RegFileKind::G4_GRF && dcl->getNumElems() == 1 &&
289+
if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
290+
(dcl->getNumElems() == 1 ||
291+
(dcl->getNumElems() == 2 && dcl->getElemType() == Type_D)) &&
278292
!dcl->getAddressed() && !dcl->getIsPartialDcl() &&
279293
!dcl->getRegVar()->isPhyRegAssigned() && !dcl->getAliasDeclare() &&
280294
!dcl->isInput() && !dcl->isOutput() && !dcl->isPayloadLiveOut() &&
@@ -325,9 +339,15 @@ void SplitAlignedScalars::run() {
325339
return newTopDcl;
326340
};
327341

328-
auto getTypeExecSize = [&](G4_Type type) {
329-
if (TypeSize(type) < 8)
330-
return std::make_tuple(1, type);
342+
auto getTypeExecSize = [&](G4_Type type, bool isScalar, G4_ExecSize execSize) {
343+
if (TypeSize(type) < 8) {
344+
if (execSize == g4::SIMD1 || isScalar)
345+
return std::make_tuple(1, type);
346+
else {
347+
vISA_ASSERT(execSize == g4::SIMD2, "invalid execution size");
348+
return std::make_tuple(2, type);
349+
}
350+
}
331351

332352
if (kernel.fg.builder->noInt64())
333353
return std::make_tuple(2, Type_UD);
@@ -380,8 +400,10 @@ void SplitAlignedScalars::run() {
380400

381401
// emit copy to store data to original non-aligned scalar
382402
unsigned int execSize = 1;
403+
auto oldExecSize = inst->getExecSize();
383404
G4_Type typeToUse = Type_UD;
384-
std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());
405+
std::tie(execSize, typeToUse) =
406+
getTypeExecSize(dst->getType(), false, oldExecSize);
385407

386408
auto src = kernel.fg.builder->createSrc(
387409
dstRgn->getBase(), dstRgn->getRegOff(), dstRgn->getSubRegOff(),
@@ -437,8 +459,18 @@ void SplitAlignedScalars::run() {
437459
newAlignedTmpTopDcl->copyAlign(oldTopDcl);
438460

439461
unsigned int execSize = 1;
462+
auto oldExecSize = inst->getExecSize();
440463
G4_Type typeToUse = Type_UD;
441-
std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());
464+
G4_Type srcType = srcRgn->getType();
465+
bool isScalar = srcRgn->isScalar();
466+
if (inst->isSend() && isScalar && oldTopDcl->getNumElems() == 2 &&
467+
TypeSize(oldTopDcl->getElemType()) == 4) {
468+
// For SIMD1 send with :d type data payload (e.g., d32x2t),
469+
// create a move to copy both dwords when replacing source
470+
srcType = Type_UQ;
471+
}
472+
std::tie(execSize, typeToUse) =
473+
getTypeExecSize(srcType, isScalar, oldExecSize);
442474

443475
// copy oldDcl in to newAlignedTmpTopDcl
444476
auto tmpDst = kernel.fg.builder->createDst(

visa/SplitAlignedScalars.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ SPDX-License-Identifier: MIT
1414
namespace vISA {
1515
class SplitAlignedScalars {
1616
private:
17-
const unsigned int MinOptDist = 200;
1817
// Constant trip count assumed for each loop to estimate dynamic inst
1918
// count change due to splitting.
2019
const unsigned int EstimatedLoopTripCount = 4;
20+
// Minimum instruction distance required for splitting
21+
unsigned int MinOptDist = 0;
2122
// Threshold percent increase in estimated dynamic inst count allowed
22-
const float BloatAllowed = 1.0f / 100.0f;
23+
float BloatAllowed = 0.0f;
2324

2425
unsigned int numDclsReplaced = 0;
2526
unsigned int numMovsAdded = 0;
@@ -65,6 +66,10 @@ class SplitAlignedScalars {
6566
public:
6667
SplitAlignedScalars(GlobalRA &g, GraphColor &c)
6768
: gra(g), coloring(c), kernel(g.kernel) {
69+
MinOptDist =
70+
g.kernel.getOptions()->getuInt32Option(vISA_SplitAlignedScalarMinDist);
71+
BloatAllowed = g.kernel.getOptions()->getuInt32Option(
72+
vISA_SplitAlignedScalarBloatPPT) / 1000.0f;
6873
for (auto spill : coloring.getSpilledLiveRanges()) {
6974
spilledDclSet.insert(spill->getDcl());
7075
}

visa/include/VISAOptionsDefs.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ DEF_VISA_OPTION(vISA_AbortOnSpillThreshold, ET_INT32, "-abortOnSpill", UNUSED,
278278
DEF_VISA_OPTION(vISA_enableBCR, ET_BOOL, "-enableBCR", UNUSED, false)
279279
DEF_VISA_OPTION(vISA_forceBCR, ET_BOOL, "-forceBCR", UNUSED, false)
280280

281+
281282
// clang-format off
282283
// Enable bundle conflict reduction: put operands of instruction into different GRF bundles.
283284
// Value: 0 disable, 1 dpas instruction, 2 non-dpas instructions, 3 all instructions
@@ -321,7 +322,12 @@ DEF_VISA_OPTION(vISA_FillConstOpt, ET_BOOL, "-nofillconstopt", UNUSED, true)
321322
DEF_VISA_OPTION(vISA_GCRRInFF, ET_BOOL, "-GCRRinFF", UNUSED, false)
322323
DEF_VISA_OPTION(vISA_IncrementalRA, ET_INT32, "-incrementalra",
323324
"USAGE: -incrementalra <0|1|2> where 0 is disabled, 1 is enabled, 2 is enabled with verification", 0)
324-
325+
DEF_VISA_OPTION(vISA_SplitAlignedScalarMinDist, ET_INT32,
326+
"-splitAlignedScalarMinDist",
327+
"dist threshold for controlling when to split aligned scalars in RA", 200)
328+
DEF_VISA_OPTION(vISA_SplitAlignedScalarBloatPPT, ET_INT32,
329+
"-splitAlignedScalarBloatRatio",
330+
"instuction increase ppt (part per thousand) for controlling when to split aligned scalars in RA", 10)
325331
//=== scheduler options ===
326332
DEF_VISA_OPTION(vISA_LocalScheduling, ET_BOOL, "-noschedule", UNUSED, true)
327333
DEF_VISA_OPTION(vISA_preRA_Schedule, ET_BOOL, "-nopresched", UNUSED, true)

0 commit comments

Comments
 (0)