Skip to content

Commit 75c7754

Browse files
dlei6gsys_zuul
authored and committed
Fixed non-uniform calls with return on stack.
The return value copy must be done inside the call loop, so that subsequent iterations do not corrupt the value when the stack space is overwritten. First a block copy is done, then a masked copy to the final dst register to respect the execution mask. Also did some refactoring for cleaner code in the call loop. Change-Id: Ia8406cdea878e14baf48ebc6c99d2545818dc144
1 parent 30405bc commit 75c7754

File tree

1 file changed

+98
-107
lines changed

1 file changed

+98
-107
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 98 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -9559,121 +9559,33 @@ void EmitPass::emitStackCall(llvm::CallInst* inst)
95599559
unsigned char argSizeInGRF = (offsetA + getGRFSize() - 1) / getGRFSize();
95609560
unsigned char retSizeInGRF = retOnStack ? 0 : (retSize + getGRFSize() - 1) / getGRFSize();
95619561

9562-
CVariable* funcAddr = GetSymbol(inst->getCalledValue());
9563-
9564-
if (!isIndirectFCall)
9565-
{
9566-
m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
9567-
m_encoder->Push();
9568-
}
9569-
else
9562+
// lambda to read the return value
9563+
auto CopyReturnValue = [this](CallInst* inst, bool isStackCopy)->void
95709564
{
9571-
if (funcAddr->IsUniform())
9572-
{
9573-
funcAddr = TruncatePointer(funcAddr);
9574-
m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
9575-
m_encoder->Push();
9576-
}
9577-
else
9565+
if (!isStackCopy)
95789566
{
9579-
// If the call is not uniform, we have to make a uniform call per lane
9580-
// First get the execution mask for active lanes
9581-
CVariable* eMask = GetExecutionMask();
9582-
// Create a label for the loop
9583-
uint label = m_encoder->GetNewLabelID();
9584-
m_encoder->Label(label);
9585-
m_encoder->Push();
9586-
9587-
// Get the first active lane's function address
9588-
CVariable* offset = nullptr;
9589-
funcAddr = TruncatePointer(funcAddr);
9590-
CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
9591-
// Set the predicate to true for all lanes with the same address
9592-
CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9593-
m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
9594-
m_encoder->Push();
9595-
9596-
uint callLabel = m_encoder->GetNewLabelID();
9597-
m_encoder->SetInversePredicate(true);
9598-
m_encoder->Jump(callPred, callLabel);
9599-
m_encoder->Push();
9600-
9601-
// Indirect call for all lanes set by the flag
9602-
m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
9603-
m_encoder->Copy(eMask, eMask);
9604-
m_encoder->Push();
9605-
9606-
// Label for lanes that skipped the call
9607-
m_encoder->Label(callLabel);
9608-
m_encoder->Push();
9609-
9610-
// Unset the bits in execution mask for lanes that were called
9611-
CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
9612-
CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9613-
m_encoder->Cast(callMask, callPred);
9614-
m_encoder->Not(callMask, callMask);
9615-
m_encoder->And(eMask, eMask, callMask);
9616-
m_encoder->Push();
9617-
m_encoder->SetP(loopPred, eMask);
9618-
m_encoder->Push();
9619-
9620-
if (!inst->use_empty() && !retOnStack)
9567+
CVariable* Dst = GetSymbol(inst);
9568+
CVariable* Src = m_currShader->GetRETV();
9569+
if (Dst->GetType() == ISA_TYPE_BOOL)
96219570
{
9622-
// Emit the return value if used: copy the reserved RET register to call's dst
9623-
// For non-uniform call, copy the ret inside this loop so that it'll honor
9624-
// the loop mask
9625-
CVariable* Dst = GetSymbol(inst);
9626-
CVariable* Src = m_currShader->GetRETV();
9627-
if (Dst->GetType() == ISA_TYPE_BOOL)
9628-
{
9629-
CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9630-
m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
9631-
}
9632-
else
9633-
{
9634-
IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9635-
if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
9636-
{
9637-
Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
9638-
}
9639-
emitCopyAll(Dst, Src, inst->getType());
9640-
}
9571+
CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9572+
m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
96419573
}
9642-
9643-
// Loop while there are bits still left in the mask
9644-
m_encoder->Jump(loopPred, label);
9645-
m_encoder->Push();
9646-
}
9647-
}
9648-
9649-
// Emit the return value if used.
9650-
if (!inst->use_empty())
9651-
{
9652-
CVariable* Dst = GetSymbol(inst);
9653-
if (!retOnStack)
9654-
{
9655-
// non-unifrm funcAddr case has been handled in above loop expansion
9656-
if (funcAddr->IsUniform()) {
9657-
CVariable* Src = m_currShader->GetRETV();
9658-
if (Dst->GetType() == ISA_TYPE_BOOL)
9659-
{
9660-
CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9661-
m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
9662-
}
9663-
else
9574+
else
9575+
{
9576+
IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9577+
if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
96649578
{
9665-
IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9666-
if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
9667-
{
9668-
Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
9669-
}
9670-
emitCopyAll(Dst, Src, inst->getType());
9579+
Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
96719580
}
9581+
emitCopyAll(Dst, Src, inst->getType());
96729582
}
96739583
}
96749584
else
96759585
{
9676-
// read return value from stack, from (SP+n)
9586+
CVariable* retDst = GetSymbol(inst);
9587+
CVariable* Dst = m_currShader->GetNewVariable(retDst);
9588+
96779589
int RmnBytes = Dst->GetSize();
96789590
IGC_ASSERT(Dst->GetType() != ISA_TYPE_BOOL);
96799591
uint32_t RdBytes = 0;
@@ -9736,14 +9648,93 @@ void EmitPass::emitStackCall(llvm::CallInst* inst)
97369648
else
97379649
m_encoder->OWLoad(pTempDst, resource, pTempVar, false, SIZE_OWORD);
97389650
m_encoder->Push();
9651+
m_encoder->SetNoMask();
97399652
emitVectorCopy(Dst, pTempDst, RmnBytes / elemSize, RdBytes, 0);
97409653
}
97419654
}
97429655
RdBytes += RdSize;
97439656
RmnBytes -= RdSize;
9744-
} while (RmnBytes > 0);
9745-
// end of reading return value from stack
9657+
} while (RmnBytes > 0); // end of reading return value from stack
9658+
9659+
// First do a block read from SP, then a copy that respects the execution mask
9660+
emitCopyAll(retDst, Dst, inst->getType());
9661+
}
9662+
};
9663+
9664+
CVariable* funcAddr = GetSymbol(inst->getCalledValue());
9665+
if (!isIndirectFCall)
9666+
{
9667+
m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
9668+
m_encoder->Push();
9669+
}
9670+
else
9671+
{
9672+
if (funcAddr->IsUniform())
9673+
{
9674+
funcAddr = TruncatePointer(funcAddr);
9675+
m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
9676+
m_encoder->Push();
97469677
}
9678+
else
9679+
{
9680+
// If the call is not uniform, we have to make a uniform call per lane
9681+
// First get the execution mask for active lanes
9682+
CVariable* eMask = GetExecutionMask();
9683+
// Create a label for the loop
9684+
uint label = m_encoder->GetNewLabelID();
9685+
m_encoder->Label(label);
9686+
m_encoder->Push();
9687+
9688+
// Get the first active lane's function address
9689+
CVariable* offset = nullptr;
9690+
funcAddr = TruncatePointer(funcAddr);
9691+
CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
9692+
// Set the predicate to true for all lanes with the same address
9693+
CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9694+
m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
9695+
m_encoder->Push();
9696+
9697+
uint callLabel = m_encoder->GetNewLabelID();
9698+
m_encoder->SetInversePredicate(true);
9699+
m_encoder->Jump(callPred, callLabel);
9700+
m_encoder->Push();
9701+
9702+
// Indirect call for all lanes set by the flag
9703+
m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
9704+
m_encoder->Copy(eMask, eMask);
9705+
m_encoder->Push();
9706+
9707+
if (!inst->use_empty())
9708+
{
9709+
// For non-uniform call, copy the ret inside this loop so that it'll honor the loop mask
9710+
CopyReturnValue(inst, retOnStack);
9711+
}
9712+
9713+
// Label for lanes that skipped the call
9714+
m_encoder->Label(callLabel);
9715+
m_encoder->Push();
9716+
9717+
// Unset the bits in execution mask for lanes that were called
9718+
CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
9719+
CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9720+
m_encoder->Cast(callMask, callPred);
9721+
m_encoder->Not(callMask, callMask);
9722+
m_encoder->And(eMask, eMask, callMask);
9723+
m_encoder->Push();
9724+
m_encoder->SetP(loopPred, eMask);
9725+
m_encoder->Push();
9726+
9727+
// Loop while there are bits still left in the mask
9728+
m_encoder->Jump(loopPred, label);
9729+
m_encoder->Push();
9730+
}
9731+
}
9732+
9733+
// Emit the return value if used
9734+
// Non-uniform handled in above loop
9735+
if (!inst->use_empty() && funcAddr->IsUniform())
9736+
{
9737+
CopyReturnValue(inst, retOnStack);
97479738
}
97489739

97499740
// Set the max stack sized pushed in the parent function for this call's args

0 commit comments

Comments
 (0)