Skip to content

Commit 77e4f96

Browse files
davidjwooigcbot
authored and committed
Shrink loads generated by constant coalescing
Try to shrink the vector loads merged by constant coalescing by looking at the indices used by the extractelement instructions that operate on the loads.
1 parent 9329900 commit 77e4f96

File tree

4 files changed

+188
-27
lines changed

4 files changed

+188
-27
lines changed

IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp

Lines changed: 178 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
1717
#include "common/LLVMWarningsPush.hpp"
1818
#include "llvmWrapper/IR/DerivedTypes.h"
1919
#include "llvmWrapper/Support/Alignment.h"
20+
#include "llvm/Transforms/Utils/Local.h"
2021
#include "common/LLVMWarningsPop.hpp"
2122
#include <list>
2223
#include "Probe/Assertion.h"
@@ -153,6 +154,8 @@ void ConstantCoalescing::ProcessFunction(Function* function)
153154
}
154155
// scan and rewrite cb-load in this block
155156
ProcessBlock(cur_blk, dircb_owloads, indcb_owloads, indcb_gathers);
157+
ShrinkChunks(dircb_owloads);
158+
ShrinkChunks(indcb_owloads);
156159
CleanupExtract(cur_blk);
157160
VectorizePrep(cur_blk);
158161
}
@@ -239,6 +242,145 @@ static bool canReplaceInsert(InsertElementInst* insertElt)
239242
return true;
240243
}
241244

245+
// Attempt to shrink the size of oword loads by looking at the indices of extractelements.
246+
247+
void ConstantCoalescing::ShrinkChunks(std::vector<BufChunk*>& chunk_vec)
248+
{
249+
for (auto chunk : chunk_vec)
250+
{
251+
uint ub = 1;
252+
uint lb = std::numeric_limits<uint>::max();
253+
for (auto user : chunk->chunkIO->users())
254+
{
255+
if (auto extractElement = dyn_cast<ExtractElementInst>(user))
256+
{
257+
if (auto index = dyn_cast<ConstantInt>(extractElement->getIndexOperand()))
258+
{
259+
uint indexValue = int_cast<uint>(index->getZExtValue()) + chunk->chunkStart;
260+
ub = std::max(ub, indexValue + 1);
261+
lb = std::min(lb, indexValue);
262+
continue;
263+
}
264+
}
265+
ub = chunk->ub;
266+
lb = chunk->lb;
267+
break;
268+
}
269+
270+
uint loadSize = RoundChunkSize(ub - lb, chunk->elementSize);
271+
IGCLLVM::FixedVectorType* originalType = dyn_cast<IGCLLVM::FixedVectorType>(chunk->chunkIO->getType());
272+
if (originalType && loadSize < originalType->getNumElements())
273+
{
274+
IGC_ASSERT(chunk->chunkSize >= loadSize);
275+
uint start_adj = lb - chunk->chunkStart;
276+
chunk->chunkSize = loadSize;
277+
chunk->chunkStart = lb;
278+
ExtensionKind extension = EK_NotExtended;
279+
// Get correct extension type and ensure that final offset is an add instruction
280+
// with a constant 2nd operand to satisfy AdjustLoad.
281+
if (auto ldRaw = dyn_cast<LdRawIntrinsic>(chunk->chunkIO))
282+
{
283+
Value* offsetValue = ldRaw->getOffsetValue();
284+
if (!isa<ConstantInt>(offsetValue))
285+
{
286+
uint offset;
287+
SimpleBaseOffset(offsetValue, offset, extension);
288+
289+
if(auto offsetInst = dyn_cast<Instruction>(offsetValue))
290+
{
291+
auto opcode = offsetInst->getOpcode();
292+
if (!(opcode == Instruction::Add && isa<ConstantInt>(offsetInst->getOperand(1))))
293+
{
294+
// Can't use irbuilder to create instruction directly or it will get constant folded.
295+
ldRaw->setOffsetValue(BinaryOperator::Create(Instruction::Add, offsetInst, irBuilder->getInt32(0), "", ldRaw));
296+
}
297+
}
298+
}
299+
}
300+
else
301+
{
302+
auto load = cast<LoadInst>(chunk->chunkIO);
303+
uint bufId = 0;
304+
Value* elt_ptrv = nullptr;
305+
BufferType bufType = BUFFER_TYPE_UNKNOWN;
306+
if (load->getPointerAddressSpace() == ADDRESS_SPACE_CONSTANT)
307+
{
308+
uint offset;
309+
Value* buf_idxv = nullptr;
310+
Value* elt_idxv = nullptr;
311+
DecomposePtrExp(load->getPointerOperand(), buf_idxv, elt_idxv, offset, extension);
312+
}
313+
else if (IsReadOnlyLoadDirectCB(load, bufId, elt_ptrv, bufType))
314+
{
315+
if (auto i2p = dyn_cast<IntToPtrInst>(elt_ptrv))
316+
{
317+
if (!isa<ConstantInt>(i2p->getOperand(0)))
318+
{
319+
uint offset = 0;
320+
SimpleBaseOffset(i2p->getOperand(0), offset, extension);
321+
322+
if (auto addressInst = dyn_cast<Instruction>(i2p->getOperand(0)))
323+
{
324+
auto opcode = addressInst->getOpcode();
325+
if (!(opcode == Instruction::Add && isa<ConstantInt>(addressInst->getOperand(1))))
326+
{
327+
// Can't use irbuilder to create instruction directly or it will get constant folded.
328+
i2p->setOperand(0, BinaryOperator::Create(Instruction::Add, addressInst, irBuilder->getInt32(0), "", i2p));
329+
}
330+
}
331+
}
332+
}
333+
}
334+
else
335+
{
336+
IGC_ASSERT(false); // All cases should be handled by now.
337+
}
338+
}
339+
AdjustLoad(chunk, extension);
340+
341+
SmallVector<Instruction*, 4> use_set;
342+
// adjust all the splitters
343+
Value::user_iterator use_it = chunk->chunkIO->user_begin();
344+
Value::user_iterator use_e = chunk->chunkIO->user_end();
345+
for (; use_it != use_e; ++use_it)
346+
{
347+
if (auto usei = dyn_cast<ExtractElementInst>(*use_it))
348+
{
349+
if (auto e_idx = dyn_cast<ConstantInt>(usei->getIndexOperand()))
350+
{
351+
uint val = (uint)e_idx->getZExtValue();
352+
IGC_ASSERT(val >= start_adj);
353+
val -= start_adj;
354+
// update the index source
355+
e_idx = ConstantInt::get(irBuilder->getInt32Ty(), val);
356+
usei->setOperand(1, e_idx);
357+
continue;
358+
}
359+
}
360+
use_set.push_back(llvm::cast<Instruction>(*use_it));
361+
}
362+
if (use_set.size() > 0)
363+
{
364+
WIAnalysis::WIDependancy loadDep = wiAns->whichDepend(chunk->chunkIO);
365+
irBuilder->SetInsertPoint(chunk->chunkIO->getNextNode());
366+
Value* vec = UndefValue::get(originalType);
367+
for (unsigned i = start_adj; i < originalType->getNumElements(); i++)
368+
{
369+
Value* channel = irBuilder->CreateExtractElement(
370+
chunk->chunkIO, irBuilder->getInt32(i));
371+
wiAns->incUpdateDepend(channel, loadDep);
372+
vec = irBuilder->CreateInsertElement(vec, channel, irBuilder->getInt32(i - start_adj));
373+
wiAns->incUpdateDepend(vec, loadDep);
374+
}
375+
for (auto it : use_set)
376+
{
377+
it->replaceUsesOfWith(chunk->chunkIO, vec);
378+
}
379+
}
380+
}
381+
}
382+
}
383+
242384
// pattern match away redundant insertElt/extractElt pairs introduced by coalescing
243385
//
244386
// %26 = load <2 x float>, <2 x float> addrspace(65546)* %chunkPtr36, align 4
@@ -1184,6 +1326,23 @@ void ConstantCoalescing::SetAlignment(Instruction* load, uint alignment)
11841326

11851327
}
11861328

1329+
// Rounds the chunk size up to the next value supported by data port
1330+
// loads or LSC transposed load.
1331+
// Legacy data port supports:
1332+
// - block loads of 1, 2, 4 or 8 OWords
1333+
// - scattered loads of 1, 2 and 4 DWords/QWords
1334+
// - (byte aligned) 1, 2 and 4 Bytes
1335+
// LSC supports:
1336+
// - transposed loads of 1, 2, 3, 4, 8, 16, 32 and 64 DWords/QWords
1337+
// - (byte aligned) loads of 1, 2 and 4 Bytes
1338+
uint32_t ConstantCoalescing::RoundChunkSize(const uint32_t chunkSize, const uint scalarSizeInBytes)
1339+
{
1340+
bool supportsVec3Load = scalarSizeInBytes >= 4 && m_ctx->platform.LSCEnabled();
1341+
uint32_t validChunkSize = (supportsVec3Load && chunkSize == 3) ?
1342+
3 : iSTD::RoundPower2((DWORD)chunkSize);
1343+
return validChunkSize;
1344+
}
1345+
11871346
void ConstantCoalescing::MergeUniformLoad(Instruction* load,
11881347
Value* bufIdxV, uint addrSpace,
11891348
Value* eltIdxV, uint offsetInBytes,
@@ -1258,28 +1417,12 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
12581417
(addrSpace == ADDRESS_SPACE_CONSTANT ||
12591418
addrSpace == ADDRESS_SPACE_GLOBAL);
12601419

1261-
// Lambda rounds the chunk size up to the next value supported by data port
1262-
// loads or LSC transposed load.
1263-
// Legacy data port supports:
1264-
// - block loads of 1, 2, 4 or 8 OWords
1265-
// - scattered loads of 1, 2 and 4 DWords/QWords
1266-
// - (byte aligned) 1, 2 and 4 Bytes
1267-
// LSC supports:
1268-
// - transposed loads of 1, 2, 3, 4, 8, 16, 32 and 64 DWords/QWords
1269-
// - (byte aligned) loads of 1, 2 and 4 Bytes
1270-
auto RoundChunkSize = [this, scalarSizeInBytes](const uint32_t chunkSize)
1271-
{
1272-
bool supportsVec3Load = scalarSizeInBytes >= 4 && m_ctx->platform.LSCEnabled();
1273-
uint32_t validChunkSize = (supportsVec3Load && chunkSize == 3) ?
1274-
3 : iSTD::RoundPower2((DWORD)chunkSize);
1275-
return validChunkSize;
1276-
};
12771420
Type* loadDataType = load->getType();
12781421
const uint32_t loadNumElements = loadDataType->isVectorTy() ?
12791422
int_cast<uint32_t>(cast<IGCLLVM::FixedVectorType>(loadDataType)->getNumElements()) : 1;
12801423
// VectorPreProcess pass legalizes loaded data size.
1281-
IGC_ASSERT(loadNumElements == RoundChunkSize(loadNumElements));
1282-
IGC_ASSERT(loadNumElements >= RoundChunkSize(maxEltPlus));
1424+
IGC_ASSERT(loadNumElements == RoundChunkSize(loadNumElements, scalarSizeInBytes));
1425+
IGC_ASSERT(loadNumElements >= RoundChunkSize(maxEltPlus, scalarSizeInBytes));
12831426
if (!cov_chunk)
12841427
{
12851428
if (isDwordAligned)
@@ -1290,7 +1433,9 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
12901433
cov_chunk->baseIdxV = eltIdxV;
12911434
cov_chunk->elementSize = scalarSizeInBytes;
12921435
cov_chunk->chunkStart = eltid;
1293-
cov_chunk->chunkSize = RoundChunkSize(maxEltPlus);
1436+
cov_chunk->chunkSize = RoundChunkSize(maxEltPlus, scalarSizeInBytes);
1437+
cov_chunk->ub = cov_chunk->chunkStart + cov_chunk->chunkSize;
1438+
cov_chunk->lb = cov_chunk->chunkStart;
12941439
const alignment_t chunkAlignment = std::max<alignment_t>(alignment, 4);
12951440
cov_chunk->chunkIO = CreateChunkLoad(load, cov_chunk, eltid, chunkAlignment, Extension);
12961441
chunk_vec.push_back(cov_chunk);
@@ -1301,10 +1446,12 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
13011446
// Determine load boundaries
13021447
uint lb = std::min(eltid, cov_chunk->chunkStart);
13031448
uint ub = std::max(eltid + maxEltPlus, cov_chunk->chunkStart + cov_chunk->chunkSize);
1449+
cov_chunk->lb = std::min(eltid, cov_chunk->lb);
1450+
cov_chunk->ub = std::max(eltid + maxEltPlus, cov_chunk->ub);
13041451

13051452
// Calculate load start and size adjustments
13061453
uint start_adj = cov_chunk->chunkStart - lb;
1307-
uint size_adj = RoundChunkSize(ub - lb) - cov_chunk->chunkSize;
1454+
uint size_adj = RoundChunkSize(ub - lb, scalarSizeInBytes) - cov_chunk->chunkSize;
13081455

13091456
// Out of bounds check
13101457
if (needsOutOfBoundsChecks)
@@ -1859,13 +2006,9 @@ Instruction* ConstantCoalescing::FindOrAddChunkExtract(BufChunk* cov_chunk, uint
18592006
return splitter;
18602007
}
18612008

1862-
void ConstantCoalescing::AdjustChunk(
1863-
BufChunk* cov_chunk, uint start_adj, uint size_adj, const ExtensionKind &Extension)
2009+
void ConstantCoalescing::AdjustLoad(BufChunk* cov_chunk, const ExtensionKind& Extension)
18642010
{
1865-
cov_chunk->chunkSize += size_adj;
1866-
cov_chunk->chunkStart -= start_adj;
18672011
// mutateType to change array-size
1868-
Type* originalType = cov_chunk->chunkIO->getType();
18692012
Type* vty = IGCLLVM::FixedVectorType::get(cov_chunk->chunkIO->getType()->getScalarType(), cov_chunk->chunkSize);
18702013
cov_chunk->chunkIO->mutateType(vty);
18712014
// change the dest ptr-type on bitcast
@@ -1998,6 +2141,15 @@ void ConstantCoalescing::AdjustChunk(
19982141
cast<Instruction>(eac)->setOperand(1, cv_start);
19992142
}
20002143
}
2144+
}
2145+
2146+
void ConstantCoalescing::AdjustChunk(
2147+
BufChunk* cov_chunk, uint start_adj, uint size_adj, const ExtensionKind &Extension)
2148+
{
2149+
cov_chunk->chunkSize += size_adj;
2150+
cov_chunk->chunkStart -= start_adj;
2151+
Type* originalType = cov_chunk->chunkIO->getType();
2152+
AdjustLoad(cov_chunk, Extension);
20012153

20022154
SmallVector<Instruction*, 4> use_set;
20032155
// adjust all the splitters
@@ -2080,6 +2232,7 @@ void ConstantCoalescing::MoveExtracts(BufChunk* cov_chunk, Instruction* load, ui
20802232
}
20812233
usei->setOperand(0, cov_chunk->chunkIO);
20822234
}
2235+
replaceAllDbgUsesWith(*load, *cov_chunk->chunkIO, *cov_chunk->chunkIO, getAnalysis<DominatorTreeWrapperPass>().getDomTree());
20832236
}
20842237
else
20852238
{

IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ namespace IGC
4343
uint elementSize; // size in bytes of the basic data element
4444
uint chunkStart; // offset of the first data element in chunk in units of elementSize
4545
uint chunkSize; // chunk size in elements
46+
uint ub; // upper bound; the first element after the last element accessed by any of the loads that
47+
// were merged into chunkIO. TODO: think up a better name for this.
48+
uint lb; // lower bound; the first element accessed by any of the loads that
49+
// were merged into chunkIO. TODO: think up a better name for this.
4650
llvm::Instruction* chunkIO; // coalesced load
4751
uint loadOrder; // direct CB used order.
4852
};
@@ -264,6 +268,7 @@ namespace IGC
264268
llvm::Value* ptr_val, llvm::Value*& buf_idxv,
265269
llvm::Value*& elt_idxv, uint& eltid, ExtensionKind &Extension);
266270
static uint CheckVectorElementUses(const llvm::Instruction* load);
271+
void AdjustLoad(BufChunk* cov_chunk, const ExtensionKind& Extension);
267272
void AdjustChunk(BufChunk* cov_chunk, uint start_adj, uint size_adj, const ExtensionKind &Extension);
268273
void EnlargeChunk(BufChunk* cov_chunk, uint size_adj);
269274
void MoveExtracts(BufChunk* cov_chunk, llvm::Instruction* load, uint start_adj);
@@ -281,6 +286,8 @@ namespace IGC
281286
Instruction* loadToReplace,
282287
Instruction* ldData,
283288
uint offsetInBytes);
289+
290+
uint32_t RoundChunkSize(const uint32_t chunkSize, const uint scalarSizeInBytes);
284291
void MergeUniformLoad(llvm::Instruction* load,
285292
llvm::Value* bufIdxV, uint addrSpace,
286293
llvm::Value* eltIdxV, uint offsetInBytes,
@@ -298,6 +305,7 @@ namespace IGC
298305
llvm::Value* eltIdxV, uint eltid,
299306
std::vector<BufChunk*>& chunk_vec);
300307

308+
void ShrinkChunks(std::vector<BufChunk*>& chunk_vec);
301309
bool CleanupExtract(llvm::BasicBlock* bb);
302310
void VectorizePrep(llvm::BasicBlock* bb);
303311
bool safeToMoveInstUp(Instruction* inst, Instruction* newLocation);

IGC/Compiler/tests/DebugInfo/ConstantCoalescing/directcb_vec.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ entry:
4141
br label %lbl1, !dbg !32
4242

4343
lbl1: ; preds = %entry
44-
%2 = add i32 15, 16, !dbg !33
44+
%2 = add i32 %a, 16, !dbg !33
4545
call void @llvm.dbg.value(metadata i32 %2, metadata !16, metadata !DIExpression()), !dbg !33
4646
%3 = inttoptr i32 %2 to <2 x float> addrspace(65536)*, !dbg !34
4747
call void @llvm.dbg.value(metadata <2 x float> addrspace(65536)* %3, metadata !18, metadata !DIExpression()), !dbg !34

IGC/Compiler/tests/DebugInfo/ConstantCoalescing/indirectcb_int.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
;
2525
; CHECK: entry:
2626
; CHECK-DAG: [[EXTR1_V:%[A-z0-9]*]] = extractelement {{.*}}[[LOAD1_V:%[A-z0-9]*]]{{.*}} !dbg [[EXTR1_LOC:![0-9]*]]
27-
; CHECK-DAG: [[LOAD1_V]] = call <2 x float> {{.*}} !dbg [[LOAD1_LOC:![0-9]*]]
27+
; CHECK-DAG: [[LOAD1_V]] = call <1 x float> {{.*}} !dbg [[LOAD1_LOC:![0-9]*]]
2828
; CHECK-DAG: [[EXTR2_V:%[A-z0-9]*]] = extractelement {{.*}}[[LOAD2_V:%[A-z0-9]*]]{{.*}} !dbg [[EXTR2_LOC:![0-9]*]]
2929
; CHECK-DAG: [[LOAD2_V]] = call <2 x float> {{.*}} !dbg [[LOAD2_LOC:![0-9]*]]
3030
; CHECK-DAG: [[EXTR3_V:%[A-z0-9]*]] = extractelement {{.*}}[[LOAD2_V]]{{.*}} !dbg [[EXTR3_LOC:![0-9]*]]

0 commit comments

Comments
 (0)