@@ -1257,10 +1257,10 @@ bool LoopVarSplit::split(G4_Declare* dcl, Loop& loop)
1257
1257
1258
1258
// replace all occurrences of dcl in loop with TMP
1259
1259
for (auto src : srcs)
1260
- replaceSrc (src, splitDcl);
1260
+ replaceSrc (src, splitDcl, loop );
1261
1261
1262
1262
for (auto dst : dsts)
1263
- replaceDst (dst, splitDcl);
1263
+ replaceDst (dst, splitDcl, loop );
1264
1264
1265
1265
splitResults[dcl].push_back (std::make_pair (splitDcl, &loop));
1266
1266
@@ -1328,14 +1328,17 @@ void LoopVarSplit::copy(G4_BB* bb, G4_Declare* dst, G4_Declare* src, SplitResult
1328
1328
const RegionDesc* rd = kernel.fg .builder ->getRegionStride1 ();
1329
1329
G4_ExecSize execSize{ kernel.numEltPerGRF <Type_UD>() };
1330
1330
1331
- if ((i + 1 ) < numRows)
1331
+ // copy 2 GRFs at a time if byte size permits
1332
+ if (bytesRemaining >= kernel.numEltPerGRF <Type_UB>() * 2 )
1332
1333
execSize = G4_ExecSize (kernel.numEltPerGRF <Type_UD>() * 2 );
1333
1334
1334
1335
auto dstRgn = kernel.fg .builder ->createDst (dst->getRegVar (), (short )i, 0 , 1 , Type_F);
1335
1336
auto srcRgn = kernel.fg .builder ->createSrc (src->getRegVar (), (short )i, 0 , rd, Type_F);
1336
1337
auto inst = kernel.fg .builder ->createMov (execSize, dstRgn, srcRgn, instOption, false );
1337
1338
1338
1339
insertCopy (inst);
1340
+ MUST_BE_TRUE (bytesRemaining >= (unsigned int )(execSize.value * G4_Type_Table[Type_F].byteSize ),
1341
+ " Invalid copy exec size" );
1339
1342
bytesRemaining -= (execSize.value * G4_Type_Table[Type_F].byteSize );
1340
1343
1341
1344
if (bytesRemaining < kernel.numEltPerGRF <Type_UB>())
@@ -1386,14 +1389,16 @@ void LoopVarSplit::copy(G4_BB* bb, G4_Declare* dst, G4_Declare* src, SplitResult
1386
1389
1387
1390
insertCopy (inst);
1388
1391
1392
+ MUST_BE_TRUE (bytesRemaining >= (execSize.value * (unsigned int )G4_Type_Table[type].byteSize ),
1393
+ " Invalid copy exec size" );
1389
1394
bytesRemaining -= (execSize.value * G4_Type_Table[type].byteSize );
1390
1395
};
1391
1396
}
1392
1397
1393
- void LoopVarSplit::replaceSrc (G4_SrcRegRegion* src, G4_Declare* dcl)
1398
+ void LoopVarSplit::replaceSrc (G4_SrcRegRegion* src, G4_Declare* dcl, const Loop& loop )
1394
1399
{
1395
1400
auto srcDcl = src->getBase ()->asRegVar ()->getDeclare ();
1396
- dcl = getNewDcl (srcDcl, dcl);
1401
+ dcl = getNewDcl (srcDcl, dcl, loop );
1397
1402
1398
1403
auto newSrcRgn = kernel.fg .builder ->createSrc (dcl->getRegVar (), src->getRegOff (),
1399
1404
src->getSubRegOff (), src->getRegion (), src->getType (), src->getAccRegSel ());
@@ -1409,10 +1414,10 @@ void LoopVarSplit::replaceSrc(G4_SrcRegRegion* src, G4_Declare* dcl)
1409
1414
}
1410
1415
}
1411
1416
1412
- void LoopVarSplit::replaceDst (G4_DstRegRegion* dst, G4_Declare* dcl)
1417
+ void LoopVarSplit::replaceDst (G4_DstRegRegion* dst, G4_Declare* dcl, const Loop& loop )
1413
1418
{
1414
1419
auto dstDcl = dst->getBase ()->asRegVar ()->getDeclare ();
1415
- dcl = getNewDcl (dstDcl, dcl);
1420
+ dcl = getNewDcl (dstDcl, dcl, loop );
1416
1421
1417
1422
auto newDstRgn = kernel.fg .builder ->createDst (dcl->getRegVar (), dst->getRegOff (),
1418
1423
dst->getSubRegOff (), dst->getHorzStride (), dst->getType (), dst->getAccRegSel ());
@@ -1421,22 +1426,35 @@ void LoopVarSplit::replaceDst(G4_DstRegRegion* dst, G4_Declare* dcl)
1421
1426
inst->setDest (newDstRgn);
1422
1427
}
1423
1428
1424
- G4_Declare* LoopVarSplit::getNewDcl (G4_Declare* dcl1, G4_Declare* dcl2)
1429
+ G4_Declare* LoopVarSplit::getNewDcl (G4_Declare* dcl1, G4_Declare* dcl2, const Loop& loop )
1425
1430
{
1426
1431
// this method gets args dcl1, dcl2. this method is invoked
1427
1432
// when the transformation replaces existing src/dst rgn with
1428
- // equivalent one but using split variable.
1433
+ // equivalent one but using split variable. e.g.,
1434
+ //
1435
+ // op ... V10(0,5) ... <-- assume V10 is alias of V9
1436
+ //
1437
+ // assume V9 gets split so V10 src rgn above has to be replaced.
1438
+ // say V9's split dcl is called LOOP_SPLIT_V9.
1439
+ // so in this function we create a new dcl, LOOP_SPLIT_V10 that
1440
+ // aliases LOOP_SPLIT_V9 exactly like V10 aliases V9. this
1441
+ // way we don't need any complicated logic to flatten V10.
1429
1442
//
1430
1443
// dcl1 is a dcl used to construct some src or dst rgn.
1431
1444
// dcl2 is a new dcl that splits dcl1. dcl2 is always root dcl.
1432
- // dcl1 may or may not be aliased of another dcl.
1445
+ // dcl1 may or may not be alias of another dcl.
1433
1446
// if dcl1 is also root dcl, then return dcl2.
1434
1447
// if dcl1 is an alias dcl, then construct new dcl that aliases
1435
1448
// dcl2 at similar offset.
1436
1449
// mapping from old dcl to new dcl is stored for future invocations.
1450
+ // this mapping is done per loop as a single spilled variable could
1451
+ // be split in multiple loops and each split instance would use a
1452
+ // different loop split variable.
1437
1453
1438
1454
MUST_BE_TRUE (!dcl2->getAliasDeclare (), " Expecting to see root dcl for dcl2" );
1439
1455
1456
+ auto & oldNewDcl = oldNewDclPerLoop[&loop];
1457
+
1440
1458
auto it = oldNewDcl.find (dcl1);
1441
1459
if (it != oldNewDcl.end ())
1442
1460
return (*it).second ;
@@ -1449,7 +1467,7 @@ G4_Declare* LoopVarSplit::getNewDcl(G4_Declare* dcl1, G4_Declare* dcl2)
1449
1467
1450
1468
auto newDcl = kernel.fg .builder ->createTempVar (dcl1->getTotalElems (), dcl1->getElemType (),
1451
1469
dcl1->getSubRegAlign ());
1452
- newDcl->setAliasDeclare (getNewDcl (dcl1->getRootDeclare (), dcl2), dcl1->getOffsetFromBase ());
1470
+ newDcl->setAliasDeclare (getNewDcl (dcl1->getRootDeclare (), dcl2, loop ), dcl1->getOffsetFromBase ());
1453
1471
1454
1472
oldNewDcl[dcl1] = newDcl;
1455
1473
0 commit comments