@@ -75,8 +75,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
           Data.allowed = false;
         }
 
-        if (dst->getTypeSize() != dstTopDcl->getByteSize()) {
-          // we require that dst opnd writes complete topdcl
+        if ((dst->getTypeSize() != dstTopDcl->getByteSize()) &&
+            !(dst->getTypeSize() == 4 && dstTopDcl->getByteSize() == 8)) {
+          // Disallow case where dst opnd does not write complete topdcl,
+          // except QW opnd which may be A64 address computed separately
+          // with low/hi DW opnds
           Data.allowed = false;
         }
 
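The relaxed rule above keeps a candidate alive when a DW write covers only half of a QW variable, since A64 addresses are commonly assembled from separately computed low and high dwords. A minimal stand-alone sketch of the predicate, with a hypothetical helper name and plain integers in place of vISA types:

    #include <cstdint>

    // Hypothetical restatement of the new dst-size check, outside vISA.
    // Returns true when the def may remain a split-scalar candidate.
    bool dstWriteAllowed(uint32_t dstTypeSizeBytes, uint32_t topDclBytes) {
      if (dstTypeSizeBytes == topDclBytes)
        return true; // dst writes the complete top dcl
      // Exception: a 4-byte (DW) opnd writing half of an 8-byte (QW)
      // variable, e.g. the low or high half of an A64 address.
      return dstTypeSizeBytes == 4 && topDclBytes == 8;
    }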
@@ -89,10 +92,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
           auto dstDcl =
               dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
           if (dstDcl->getAliasDeclare()) {
-            // disallow case where topdcl is scalar, but alias dcl
-            // is not a scalar as it may be smaller in size. for eg,
-            // topdcl may be :uq and alias may be of type :ud.
-            if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
+            // Disallow case where dst alias dcl size is different from
+            // top dcl, except QW opnd which may be A64 address computed
+            // separately with low/hi DW opnds
+            if ((dstDcl->getByteSize() != dstTopDcl->getByteSize()) &&
+                !(dstDcl->getByteSize() == 4 && dstTopDcl->getByteSize() == 8))
               Data.allowed = false;
           }
 
@@ -102,6 +106,9 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
         }
 
         if (!inst->isSend()) {
+          if (inst->getExecSize() != 1) {
+            Data.allowed = false;
+          }
           // check whether dst type size != src type size
           // disallow optimization if dst type is different
           // than src as that entails alignment requirements
@@ -128,6 +135,10 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
           Data.lastUse = lexId;
 
           if (!inst->isSend()) {
+            if (!(src->asSrcRegRegion()->isScalar())) {
+              Data.allowed = false;
+            }
+
             // mixed types have alignment requirements
             if (dst->getTypeSize() != src->getTypeSize())
               Data.allowed = false;
@@ -212,9 +223,8 @@ void SplitAlignedScalars::pruneCandidates(
   std::for_each(candidates.begin(), candidates.end(),
                 [&](G4_Declare *dcl) { candidateList.push_back(dcl); });
 
-  // First re-order candidates based on spill cost in descending order
+  // First re-order candidates based on spill cost in ascending order
   candidateList.sort(compareSpillCost);
-  std::reverse(candidateList.begin(), candidateList.end());
 
   for (auto it = candidateList.begin(); it != candidateList.end();) {
     auto dcl = *it;
@@ -248,7 +258,9 @@ void SplitAlignedScalars::pruneCandidates(
   candidates.clear();
   unsigned int totalMovsNeeded = 0;
   unsigned int estimatedInstCount = estimateInstCount();
-  for (auto candidate : candidateList) {
+
+  for (auto ci = candidateList.rbegin(); ci != candidateList.rend(); ++ci) {
+    const auto &candidate = *ci;
     bool isCandidate = false;
     auto numMovsNeeded = computeNumMovs(candidate);
 
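With std::reverse gone, candidateList stays in ascending spill-cost order: the first loop now visits candidates cheapest-first, while the loop above uses reverse iterators to visit the costliest candidates first, matching the descending walk the old code got from sort plus std::reverse. A small illustrative snippet of the reverse-iteration idiom, with stand-in integer costs rather than vISA data:

    #include <iostream>
    #include <list>

    int main() {
      std::list<int> costs = {5, 1, 9, 3};
      costs.sort(); // ascending: 1 3 5 9
      // Reverse iteration recovers the descending-cost visit order that
      // sort + std::reverse + forward iteration used to produce.
      for (auto it = costs.rbegin(); it != costs.rend(); ++it)
        std::cout << *it << ' '; // prints: 9 5 3 1
      std::cout << '\n';
    }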
@@ -274,7 +286,9 @@ void SplitAlignedScalars::pruneCandidates(
 }
 
 bool SplitAlignedScalars::isDclCandidate(G4_Declare *dcl) {
-  if (dcl->getRegFile() == G4_RegFileKind::G4_GRF && dcl->getNumElems() == 1 &&
+  if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
+      (dcl->getNumElems() == 1 ||
+       (dcl->getNumElems() == 2 && dcl->getElemType() == Type_D)) &&
       !dcl->getAddressed() && !dcl->getIsPartialDcl() &&
       !dcl->getRegVar()->isPhyRegAssigned() && !dcl->getAliasDeclare() &&
       !dcl->isInput() && !dcl->isOutput() && !dcl->isPayloadLiveOut() &&
@@ -325,9 +339,15 @@ void SplitAlignedScalars::run() {
     return newTopDcl;
   };
 
-  auto getTypeExecSize = [&](G4_Type type) {
-    if (TypeSize(type) < 8)
-      return std::make_tuple(1, type);
+  auto getTypeExecSize = [&](G4_Type type, bool isScalar, G4_ExecSize execSize) {
+    if (TypeSize(type) < 8) {
+      if (execSize == g4::SIMD1 || isScalar)
+        return std::make_tuple(1, type);
+      else {
+        vISA_ASSERT(execSize == g4::SIMD2, "invalid execution size");
+        return std::make_tuple(2, type);
+      }
+    }
 
     if (kernel.fg.builder->noInt64())
       return std::make_tuple(2, Type_UD);
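The extended lambda now distinguishes a genuinely scalar sub-QW move (SIMD1) from the new two-dword case (SIMD2), alongside the existing QW handling. A stand-alone mock of that decision table, with a plain enum and assert standing in for G4_Type, g4::SIMD*, and vISA_ASSERT:

    #include <cassert>
    #include <tuple>

    enum MockType { UD = 4, UQ = 8 }; // value = element size in bytes

    // Mock of the selection logic only; the real lambda consults the
    // IR builder for noInt64() and uses vISA types throughout.
    std::tuple<int, MockType> typeExecSize(MockType t, bool isScalar,
                                           int execSize, bool noInt64) {
      if (t < 8) { // sub-QW data
        if (execSize == 1 || isScalar)
          return {1, t}; // plain scalar copy
        assert(execSize == 2 && "invalid execution size");
        return {2, t};   // two-dword (e.g. d32x2) copy
      }
      if (noInt64)
        return {2, UD};  // copy a QW as SIMD2 of UD when int64 is absent
      return {1, t};     // assumed fall-through: native QW move
    }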
@@ -380,8 +400,10 @@ void SplitAlignedScalars::run() {
 
         // emit copy to store data to original non-aligned scalar
         unsigned int execSize = 1;
+        auto oldExecSize = inst->getExecSize();
         G4_Type typeToUse = Type_UD;
-        std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());
+        std::tie(execSize, typeToUse) =
+            getTypeExecSize(dst->getType(), false, oldExecSize);
 
         auto src = kernel.fg.builder->createSrc(
             dstRgn->getBase(), dstRgn->getRegOff(), dstRgn->getSubRegOff(),
@@ -437,8 +459,18 @@ void SplitAlignedScalars::run() {
           newAlignedTmpTopDcl->copyAlign(oldTopDcl);
 
           unsigned int execSize = 1;
+          auto oldExecSize = inst->getExecSize();
           G4_Type typeToUse = Type_UD;
-          std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());
+          G4_Type srcType = srcRgn->getType();
+          bool isScalar = srcRgn->isScalar();
+          if (inst->isSend() && isScalar && oldTopDcl->getNumElems() == 2 &&
+              TypeSize(oldTopDcl->getElemType()) == 4) {
+            // For SIMD1 send with :d type data payload (e.g., d32x2t),
+            // create a move to copy both dwords when replacing source
+            srcType = Type_UQ;
+          }
+          std::tie(execSize, typeToUse) =
+              getTypeExecSize(srcType, isScalar, oldExecSize);
 
           // copy oldDcl into newAlignedTmpTopDcl
           auto tmpDst = kernel.fg.builder->createDst(
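Widening srcType to Type_UQ makes the single copy cover both dwords of the two-element payload rather than only the first. A hedged sketch of the widening condition with plain integers; the names are illustrative, not the vISA API:

    #include <cstdint>

    // Illustrative only: when a scalar source feeding a send is backed by
    // a 2 x 32-bit variable (e.g. a d32x2t payload), copy it as one
    // 64-bit quantity so both dwords reach the aligned temporary.
    uint32_t copyTypeSizeBytes(bool isSend, bool isScalarSrc,
                               uint32_t numElems, uint32_t elemBytes) {
      if (isSend && isScalarSrc && numElems == 2 && elemBytes == 4)
        return 8; // treat the pair of dwords as one QW move
      return elemBytes;
    }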