@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
 #include "common/LLVMWarningsPush.hpp"
 #include "llvmWrapper/IR/DerivedTypes.h"
 #include "llvmWrapper/Support/Alignment.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "common/LLVMWarningsPop.hpp"
 #include <list>
 #include "Probe/Assertion.h"
@@ -153,6 +154,8 @@ void ConstantCoalescing::ProcessFunction(Function* function)
        }
        // scan and rewrite cb-load in this block
        ProcessBlock(cur_blk, dircb_owloads, indcb_owloads, indcb_gathers);
+       ShrinkChunks(dircb_owloads);
+       ShrinkChunks(indcb_owloads);
        CleanupExtract(cur_blk);
        VectorizePrep(cur_blk);
    }
@@ -239,6 +242,145 @@ static bool canReplaceInsert(InsertElementInst* insertElt)
     return true;
 }

+// Attempt to shrink the size of oword loads by looking at the indices of extractelements.
+
+void ConstantCoalescing::ShrinkChunks(std::vector<BufChunk*>& chunk_vec)
+{
+    for (auto chunk : chunk_vec)
+    {
+        uint ub = 1;
+        uint lb = std::numeric_limits<uint>::max();
+        for (auto user : chunk->chunkIO->users())
+        {
+            if (auto extractElement = dyn_cast<ExtractElementInst>(user))
+            {
+                if (auto index = dyn_cast<ConstantInt>(extractElement->getIndexOperand()))
+                {
+                    uint indexValue = int_cast<uint>(index->getZExtValue()) + chunk->chunkStart;
+                    ub = std::max(ub, indexValue + 1);
+                    lb = std::min(lb, indexValue);
+                    continue;
+                }
+            }
+            ub = chunk->ub;
+            lb = chunk->lb;
+            break;
+        }
+
+        uint loadSize = RoundChunkSize(ub - lb, chunk->elementSize);
+        IGCLLVM::FixedVectorType* originalType = dyn_cast<IGCLLVM::FixedVectorType>(chunk->chunkIO->getType());
+        if (originalType && loadSize < originalType->getNumElements())
+        {
+            IGC_ASSERT(chunk->chunkSize >= loadSize);
+            uint start_adj = lb - chunk->chunkStart;
+            chunk->chunkSize = loadSize;
+            chunk->chunkStart = lb;
+            ExtensionKind extension = EK_NotExtended;
+            // Get correct extension type and ensure that final offset is an add instruction
+            // with a constant 2nd operand to satisfy AdjustLoad.
+            if (auto ldRaw = dyn_cast<LdRawIntrinsic>(chunk->chunkIO))
+            {
+                Value* offsetValue = ldRaw->getOffsetValue();
+                if (!isa<ConstantInt>(offsetValue))
+                {
+                    uint offset;
+                    SimpleBaseOffset(offsetValue, offset, extension);
+
+                    if (auto offsetInst = dyn_cast<Instruction>(offsetValue))
+                    {
+                        auto opcode = offsetInst->getOpcode();
+                        if (!(opcode == Instruction::Add && isa<ConstantInt>(offsetInst->getOperand(1))))
+                        {
+                            // Can't use irbuilder to create instruction directly or it will get constant folded.
+                            ldRaw->setOffsetValue(BinaryOperator::Create(Instruction::Add, offsetInst, irBuilder->getInt32(0), "", ldRaw));
+                        }
+                    }
+                }
+            }
+            else
+            {
+                auto load = cast<LoadInst>(chunk->chunkIO);
+                uint bufId = 0;
+                Value* elt_ptrv = nullptr;
+                BufferType bufType = BUFFER_TYPE_UNKNOWN;
+                if (load->getPointerAddressSpace() == ADDRESS_SPACE_CONSTANT)
+                {
+                    uint offset;
+                    Value* buf_idxv = nullptr;
+                    Value* elt_idxv = nullptr;
+                    DecomposePtrExp(load->getPointerOperand(), buf_idxv, elt_idxv, offset, extension);
+                }
+                else if (IsReadOnlyLoadDirectCB(load, bufId, elt_ptrv, bufType))
+                {
+                    if (auto i2p = dyn_cast<IntToPtrInst>(elt_ptrv))
+                    {
+                        if (!isa<ConstantInt>(i2p->getOperand(0)))
+                        {
+                            uint offset = 0;
+                            SimpleBaseOffset(i2p->getOperand(0), offset, extension);
+
+                            if (auto addressInst = dyn_cast<Instruction>(i2p->getOperand(0)))
+                            {
+                                auto opcode = addressInst->getOpcode();
+                                if (!(opcode == Instruction::Add && isa<ConstantInt>(addressInst->getOperand(1))))
+                                {
+                                    // Can't use irbuilder to create instruction directly or it will get constant folded.
+                                    i2p->setOperand(0, BinaryOperator::Create(Instruction::Add, addressInst, irBuilder->getInt32(0), "", i2p));
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    IGC_ASSERT(false); // All cases should be handled by now.
+                }
+            }
+            AdjustLoad(chunk, extension);
+
+            SmallVector<Instruction*, 4> use_set;
+            // adjust all the splitters
+            Value::user_iterator use_it = chunk->chunkIO->user_begin();
+            Value::user_iterator use_e = chunk->chunkIO->user_end();
+            for (; use_it != use_e; ++use_it)
+            {
+                if (auto usei = dyn_cast<ExtractElementInst>(*use_it))
+                {
+                    if (auto e_idx = dyn_cast<ConstantInt>(usei->getIndexOperand()))
+                    {
+                        uint val = (uint)e_idx->getZExtValue();
+                        IGC_ASSERT(val >= start_adj);
+                        val -= start_adj;
+                        // update the index source
+                        e_idx = ConstantInt::get(irBuilder->getInt32Ty(), val);
+                        usei->setOperand(1, e_idx);
+                        continue;
+                    }
+                }
+                use_set.push_back(llvm::cast<Instruction>(*use_it));
+            }
+            if (use_set.size() > 0)
+            {
+                WIAnalysis::WIDependancy loadDep = wiAns->whichDepend(chunk->chunkIO);
+                irBuilder->SetInsertPoint(chunk->chunkIO->getNextNode());
+                Value* vec = UndefValue::get(originalType);
+                for (unsigned i = start_adj; i < originalType->getNumElements(); i++)
+                {
+                    Value* channel = irBuilder->CreateExtractElement(
+                        chunk->chunkIO, irBuilder->getInt32(i));
+                    wiAns->incUpdateDepend(channel, loadDep);
+                    vec = irBuilder->CreateInsertElement(vec, channel, irBuilder->getInt32(i - start_adj));
+                    wiAns->incUpdateDepend(vec, loadDep);
+                }
+                for (auto it : use_set)
+                {
+                    it->replaceUsesOfWith(chunk->chunkIO, vec);
+                }
+            }
+        }
+    }
+}
+
 // pattern match away redundant insertElt/extractElt pairs introduced by coalescing
 //
 // %26 = load <2 x float>, <2 x float> addrspace(65546)* %chunkPtr36, align 4
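For intuition about what ShrinkChunks computes: it scans the constant extractelement indices that actually read from a coalesced chunk, takes the lowest and highest used element, and rounds the resulting width back up to a legal load size before re-pointing the extracts at the narrower load. The following is a minimal standalone sketch of just that bound computation, not IGC code: ShrinkToUsedLanes, ShrunkBounds, and RoundUpToPow2 are illustrative names, RoundUpToPow2 stands in for iSTD::RoundPower2, the LSC vec3 special case is omitted, and at least one constant index is assumed.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

// Stand-in for iSTD::RoundPower2: round v up to the next power of two.
static uint32_t RoundUpToPow2(uint32_t v)
{
    uint32_t p = 1;
    while (p < v)
        p <<= 1;
    return p;
}

struct ShrunkBounds
{
    uint32_t chunkStart; // first element kept, relative to the buffer
    uint32_t chunkSize;  // legalized number of elements to load
};

// usedIndices: constant extractelement indices into the chunk's vector (assumed non-empty).
// chunkStart:  the chunk's current starting element, as in BufChunk.
ShrunkBounds ShrinkToUsedLanes(const std::vector<uint32_t>& usedIndices, uint32_t chunkStart)
{
    uint32_t ub = 1;
    uint32_t lb = std::numeric_limits<uint32_t>::max();
    for (uint32_t idx : usedIndices)
    {
        const uint32_t v = idx + chunkStart; // rebase onto the buffer, as ShrinkChunks does
        ub = std::max(ub, v + 1);
        lb = std::min(lb, v);
    }
    return { lb, RoundUpToPow2(ub - lb) };
}

int main()
{
    // A chunk starting at element 0 whose only users extract lanes 4 and 5
    // shrinks to a 2-element load starting at element 4; the surviving
    // extracts are then re-indexed by start_adj = lb - old chunkStart = 4.
    const ShrunkBounds b = ShrinkToUsedLanes({ 4, 5 }, 0);
    assert(b.chunkStart == 4 && b.chunkSize == 2);
    return 0;
}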
@@ -1184,6 +1326,23 @@ void ConstantCoalescing::SetAlignment(Instruction* load, uint alignment)

 }

+// Rounds the chunk size up to the next value supported by data port
+// loads or LSC transposed load.
+// Legacy data port supports:
+// - block loads of 1, 2, 4 or 8 OWords
+// - scattered loads of 1, 2 and 4 DWords/QWords
+// - (byte aligned) 1, 2 and 4 Bytes
+// LSC supports:
+// - transposed loads of 1, 2, 3, 4, 8, 16, 32 and 64 DWords/QWords
+// - (byte aligned) loads of 1, 2 and 4 Bytes
+uint32_t ConstantCoalescing::RoundChunkSize(const uint32_t chunkSize, const uint scalarSizeInBytes)
+{
+    bool supportsVec3Load = scalarSizeInBytes >= 4 && m_ctx->platform.LSCEnabled();
+    uint32_t validChunkSize = (supportsVec3Load && chunkSize == 3) ?
+        3 : iSTD::RoundPower2((DWORD)chunkSize);
+    return validChunkSize;
+}
+
 void ConstantCoalescing::MergeUniformLoad(Instruction* load,
     Value* bufIdxV, uint addrSpace,
     Value* eltIdxV, uint offsetInBytes,
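A rough, self-contained illustration of the rounding rule documented above. This is a sketch only: it assumes iSTD::RoundPower2 rounds up to the next power of two and uses a plain bool in place of m_ctx->platform.LSCEnabled(); RoundChunkSizeSketch is an illustrative name.

#include <bit>      // std::bit_ceil (C++20)
#include <cassert>
#include <cstdint>

// Sketch of the same rounding rule, for intuition only.
static uint32_t RoundChunkSizeSketch(uint32_t chunkSize, uint32_t scalarSizeInBytes, bool lscEnabled)
{
    const bool supportsVec3Load = scalarSizeInBytes >= 4 && lscEnabled;
    return (supportsVec3Load && chunkSize == 3) ? 3u : std::bit_ceil(chunkSize);
}

int main()
{
    assert(RoundChunkSizeSketch(3, 4, true) == 3);   // DWord vec3 stays 3 with LSC transposed loads
    assert(RoundChunkSizeSketch(3, 2, true) == 4);   // sub-DWord elements: no vec3, round up to 4
    assert(RoundChunkSizeSketch(3, 4, false) == 4);  // legacy data port: round up to 4
    assert(RoundChunkSizeSketch(5, 4, true) == 8);   // otherwise next power of two
    return 0;
}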
@@ -1258,28 +1417,12 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
        (addrSpace == ADDRESS_SPACE_CONSTANT ||
         addrSpace == ADDRESS_SPACE_GLOBAL);

-    // Lambda rounds the chunk size up to the next value supported by data port
-    // loads or LSC transposed load.
-    // Legacy data port supports:
-    // - block loads of 1, 2, 4 or 8 OWords
-    // - scattered loads of 1, 2 and 4 DWords/QWords
-    // - (byte aligned) 1, 2 and 4 Bytes
-    // LSC supports:
-    // - transposed loads of 1, 2, 3, 4, 8, 16, 32 and 64 DWords/QWords
-    // - (byte aligned) loads of 1, 2 and 4 Bytes
-    auto RoundChunkSize = [this, scalarSizeInBytes](const uint32_t chunkSize)
-    {
-        bool supportsVec3Load = scalarSizeInBytes >= 4 && m_ctx->platform.LSCEnabled();
-        uint32_t validChunkSize = (supportsVec3Load && chunkSize == 3) ?
-            3 : iSTD::RoundPower2((DWORD)chunkSize);
-        return validChunkSize;
-    };
    Type* loadDataType = load->getType();
    const uint32_t loadNumElements = loadDataType->isVectorTy() ?
        int_cast<uint32_t>(cast<IGCLLVM::FixedVectorType>(loadDataType)->getNumElements()) : 1;
    // VectorPreProcess pass legalizes loaded data size.
-    IGC_ASSERT(loadNumElements == RoundChunkSize(loadNumElements));
-    IGC_ASSERT(loadNumElements >= RoundChunkSize(maxEltPlus));
+    IGC_ASSERT(loadNumElements == RoundChunkSize(loadNumElements, scalarSizeInBytes));
+    IGC_ASSERT(loadNumElements >= RoundChunkSize(maxEltPlus, scalarSizeInBytes));
    if (!cov_chunk)
    {
        if (isDwordAligned)
@@ -1290,7 +1433,9 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
            cov_chunk->baseIdxV = eltIdxV;
            cov_chunk->elementSize = scalarSizeInBytes;
            cov_chunk->chunkStart = eltid;
-            cov_chunk->chunkSize = RoundChunkSize(maxEltPlus);
+            cov_chunk->chunkSize = RoundChunkSize(maxEltPlus, scalarSizeInBytes);
+            cov_chunk->ub = cov_chunk->chunkStart + cov_chunk->chunkSize;
+            cov_chunk->lb = cov_chunk->chunkStart;
            const alignment_t chunkAlignment = std::max<alignment_t>(alignment, 4);
            cov_chunk->chunkIO = CreateChunkLoad(load, cov_chunk, eltid, chunkAlignment, Extension);
            chunk_vec.push_back(cov_chunk);
@@ -1301,10 +1446,12 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
        // Determine load boundaries
        uint lb = std::min(eltid, cov_chunk->chunkStart);
        uint ub = std::max(eltid + maxEltPlus, cov_chunk->chunkStart + cov_chunk->chunkSize);
+        cov_chunk->lb = std::min(eltid, cov_chunk->lb);
+        cov_chunk->ub = std::max(eltid + maxEltPlus, cov_chunk->ub);

        // Calculate load start and size adjustments
        uint start_adj = cov_chunk->chunkStart - lb;
-        uint size_adj = RoundChunkSize(ub - lb) - cov_chunk->chunkSize;
+        uint size_adj = RoundChunkSize(ub - lb, scalarSizeInBytes) - cov_chunk->chunkSize;

        // Out of bounds check
        if (needsOutOfBoundsChecks)
@@ -1859,13 +2006,9 @@ Instruction* ConstantCoalescing::FindOrAddChunkExtract(BufChunk* cov_chunk, uint
    return splitter;
 }

-void ConstantCoalescing::AdjustChunk(
-    BufChunk* cov_chunk, uint start_adj, uint size_adj, const ExtensionKind &Extension)
+void ConstantCoalescing::AdjustLoad(BufChunk* cov_chunk, const ExtensionKind& Extension)
 {
-    cov_chunk->chunkSize += size_adj;
-    cov_chunk->chunkStart -= start_adj;
    // mutateType to change array-size
-    Type* originalType = cov_chunk->chunkIO->getType();
    Type* vty = IGCLLVM::FixedVectorType::get(cov_chunk->chunkIO->getType()->getScalarType(), cov_chunk->chunkSize);
    cov_chunk->chunkIO->mutateType(vty);
    // change the dest ptr-type on bitcast
@@ -1998,6 +2141,15 @@ void ConstantCoalescing::AdjustChunk(
            cast<Instruction>(eac)->setOperand(1, cv_start);
        }
    }
+}
+
+void ConstantCoalescing::AdjustChunk(
+    BufChunk* cov_chunk, uint start_adj, uint size_adj, const ExtensionKind &Extension)
+{
+    cov_chunk->chunkSize += size_adj;
+    cov_chunk->chunkStart -= start_adj;
+    Type* originalType = cov_chunk->chunkIO->getType();
+    AdjustLoad(cov_chunk, Extension);

    SmallVector<Instruction*, 4> use_set;
    // adjust all the splitters
@@ -2080,6 +2232,7 @@ void ConstantCoalescing::MoveExtracts(BufChunk* cov_chunk, Instruction* load, ui
            }
            usei->setOperand(0, cov_chunk->chunkIO);
        }
+        replaceAllDbgUsesWith(*load, *cov_chunk->chunkIO, *cov_chunk->chunkIO, getAnalysis<DominatorTreeWrapperPass>().getDomTree());
    }
    else
    {