Skip to content

Commit 356d190

Browse files
DianaChen authored and igcbot committed
vISA: Cross-thread size should be 32-byte aligned instead of GRF size aligned
Cross-thread-size alignment is the convention between IGC and NEO and should be 32-byte aligned. Update vISA load-payload prolog generation accordingly. Also update the patch token value "dataParameterStreamSize" to report the same size as expected in vISA.
1 parent 847209b commit 356d190

File tree

2 files changed

+55
-37
lines changed

2 files changed

+55
-37
lines changed

IGC/AdaptorOCL/OCL/sp/sp_g8.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1872,8 +1872,9 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(
18721872
annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );
18731873
}
18741874

1875-
// Payload must be a multiple of a GRF register
1876-
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());
1875+
// Payload must be a multiple of 32 bytes
1876+
// This assumption has to be the same as in vISA::Optimizer::loadThreadPayload
1877+
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);
18771878

18781879
if( retValue.Success )
18791880
{

visa/Optimizer.cpp

Lines changed: 52 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7753,35 +7753,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
77537753
{
77547754
return;
77557755
}
7756-
// indirect data address is at r0.0[5:31]
7757-
// local thread id is at r0.2[0:7]
7758-
// use r127 as the header for each oword load
7759-
uint32_t startGRF =
7760-
kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
7761-
uint32_t inputEnd = 32;
7762-
uint32_t inputCount = kernel.fg.builder->getInputCount();
7763-
for (unsigned int id = 0; id < inputCount; id++)
7764-
{
7765-
input_info_t* input_info = kernel.fg.builder->getInputArg(id);
7766-
// skip pseudo input for register bindings.
7767-
if (input_info->isPseudoInput())
7768-
{
7769-
continue;
7770-
}
7771-
if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
7772-
{
7773-
vISA::G4_Declare* dcl = input_info->dcl;
7774-
if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
7775-
{
7776-
break;
7777-
}
7778-
}
7779-
if (inputEnd < (unsigned)(input_info->size + input_info->offset))
7780-
{
7781-
inputEnd = input_info->size + input_info->offset;
7782-
}
7783-
}
7784-
int numGRF = ((inputEnd + getGRFSize() - 1) / getGRFSize()) - startGRF;
7756+
77857757
std::vector<G4_INST*> instBuffer;
77867758

77877759
G4_Declare* r0 = builder.createHardwiredDeclare(8, Type_UD, 0, 0);
@@ -7914,6 +7886,41 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79147886
int addrSubreg = 2;
79157887

79167888
G4_BB* perThreadBB = nullptr;
7889+
7890+
// Calculate the payload size:
7891+
// indirect data address is at r0.0[5:31]
7892+
// local thread id is at r0.2[0:7]
7893+
// use r127 as the header for each oword load
7894+
uint32_t startGRF =
7895+
kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
7896+
uint32_t inputEnd = 32;
7897+
uint32_t inputCount = kernel.fg.builder->getInputCount();
7898+
for (unsigned int id = 0; id < inputCount; id++)
7899+
{
7900+
input_info_t* input_info = kernel.fg.builder->getInputArg(id);
7901+
// skip pseudo input for register bindings.
7902+
if (input_info->isPseudoInput())
7903+
{
7904+
continue;
7905+
}
7906+
if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
7907+
{
7908+
vISA::G4_Declare* dcl = input_info->dcl;
7909+
if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
7910+
{
7911+
break;
7912+
}
7913+
}
7914+
if (inputEnd < (unsigned)(input_info->size + input_info->offset))
7915+
{
7916+
inputEnd = input_info->size + input_info->offset;
7917+
}
7918+
}
7919+
// cross-thread-payload size must be 32-bytes aligned hence the entire payload must be 32-bytes aligned too
7920+
// GRF size must be 32-bytes aligned so align inputEnd to 32-bytes satisfies it
7921+
inputEnd = (inputEnd % 32) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
7922+
uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize() ? 0 : inputEnd - startGRF * getGRFSize();
7923+
79177924
// Load per-thread data, if any. Per-thread data always start from r1
79187925
// this is a fixed size 8 inst (nop padded as necessary), which may be skipped
79197926
// by runtime if the local_id are auto-generated by HW.
@@ -7924,12 +7931,17 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79247931
{
79257932
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
79267933
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
7934+
// per-thread-data size must be GRF-size aligned,
79277935
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7928-
uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
7936+
// cross-thread-data size must be 32-bytes aligned
7937+
uint32_t numCrossThreadDW = (CTIS < 0) ?
7938+
(payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
7939+
CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
79297940

79307941
if (useInlineData)
79317942
{
7932-
numCrossThreadGRF--;
7943+
// skip the first GRF
7944+
numCrossThreadDW -= numEltPerGRF<Type_UD>();
79337945
}
79347946
instBuffer.push_back(getLabel("per_thread_prolog"));
79357947

@@ -7959,7 +7971,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
79597971
// create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
79607972
// cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
79617973
// need to be resolved to the new cross_thread_size.
7962-
G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadGRF * numEltPerGRF<Type_UB>(), Type_UW);
7974+
G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadDW * TypeSize(Type_UD) , Type_UW);
79637975
auto addDst = builder.createDst(rtail->getRegVar(), 0, 2, 1, Type_UD);
79647976
// instruction has relocation must not be compacted
79657977
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8041,17 +8053,22 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
80418053
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
80428054

80438055
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
8044-
uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
8056+
// cross-thread-data size must be 32-bytes aligned
8057+
uint32_t numCrossThreadDW = (CTIS < 0) ?
8058+
(payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
8059+
CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
80458060
uint32_t crossThreadStart = startGRF + numPerThreadGRF;
80468061

80478062
if (useInlineData)
80488063
{
80498064
// first GRF of cross-thread data is already loaded
80508065
crossThreadStart++;
8051-
numCrossThreadGRF--;
8066+
numCrossThreadDW -= numEltPerGRF<Type_UD>();
80528067
}
80538068
{
8054-
loadFromMemory(rtail, crossThreadStart, numCrossThreadGRF);
8069+
// GRF size is 32-bytes in this case so numCrossThreadDW must be GRF size aligned
8070+
assert(!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
8071+
loadFromMemory(rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
80558072
}
80568073
}
80578074

0 commit comments

Comments
 (0)