@@ -9559,121 +9559,33 @@ void EmitPass::emitStackCall(llvm::CallInst* inst)
9559
9559
unsigned char argSizeInGRF = (offsetA + getGRFSize() - 1) / getGRFSize();
9560
9560
unsigned char retSizeInGRF = retOnStack ? 0 : (retSize + getGRFSize() - 1) / getGRFSize();
9561
9561
9562
- CVariable* funcAddr = GetSymbol(inst->getCalledValue());
9563
-
9564
- if (!isIndirectFCall)
9565
- {
9566
- m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
9567
- m_encoder->Push();
9568
- }
9569
- else
9562
+ // lambda to read the return value
9563
+ auto CopyReturnValue = [this](CallInst* inst, bool isStackCopy)->void
9570
9564
{
9571
- if (funcAddr->IsUniform())
9572
- {
9573
- funcAddr = TruncatePointer(funcAddr);
9574
- m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
9575
- m_encoder->Push();
9576
- }
9577
- else
9565
+ if (!isStackCopy)
9578
9566
{
9579
- // If the call is not uniform, we have to make a uniform call per lane
9580
- // First get the execution mask for active lanes
9581
- CVariable* eMask = GetExecutionMask();
9582
- // Create a label for the loop
9583
- uint label = m_encoder->GetNewLabelID();
9584
- m_encoder->Label(label);
9585
- m_encoder->Push();
9586
-
9587
- // Get the first active lane's function address
9588
- CVariable* offset = nullptr;
9589
- funcAddr = TruncatePointer(funcAddr);
9590
- CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
9591
- // Set the predicate to true for all lanes with the same address
9592
- CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9593
- m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
9594
- m_encoder->Push();
9595
-
9596
- uint callLabel = m_encoder->GetNewLabelID();
9597
- m_encoder->SetInversePredicate(true);
9598
- m_encoder->Jump(callPred, callLabel);
9599
- m_encoder->Push();
9600
-
9601
- // Indirect call for all lanes set by the flag
9602
- m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
9603
- m_encoder->Copy(eMask, eMask);
9604
- m_encoder->Push();
9605
-
9606
- // Label for lanes that skipped the call
9607
- m_encoder->Label(callLabel);
9608
- m_encoder->Push();
9609
-
9610
- // Unset the bits in execution mask for lanes that were called
9611
- CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
9612
- CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9613
- m_encoder->Cast(callMask, callPred);
9614
- m_encoder->Not(callMask, callMask);
9615
- m_encoder->And(eMask, eMask, callMask);
9616
- m_encoder->Push();
9617
- m_encoder->SetP(loopPred, eMask);
9618
- m_encoder->Push();
9619
-
9620
- if (!inst->use_empty() && !retOnStack)
9567
+ CVariable* Dst = GetSymbol(inst);
9568
+ CVariable* Src = m_currShader->GetRETV();
9569
+ if (Dst->GetType() == ISA_TYPE_BOOL)
9621
9570
{
9622
- // Emit the return value if used: copy the reserved RET register to call's dst
9623
- // For non-uniform call, copy the ret inside this loop so that it'll honor
9624
- // the loop mask
9625
- CVariable* Dst = GetSymbol(inst);
9626
- CVariable* Src = m_currShader->GetRETV();
9627
- if (Dst->GetType() == ISA_TYPE_BOOL)
9628
- {
9629
- CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9630
- m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
9631
- }
9632
- else
9633
- {
9634
- IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9635
- if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
9636
- {
9637
- Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
9638
- }
9639
- emitCopyAll(Dst, Src, inst->getType());
9640
- }
9571
+ CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9572
+ m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
9641
9573
}
9642
-
9643
- // Loop while there are bits still left in the mask
9644
- m_encoder->Jump(loopPred, label);
9645
- m_encoder->Push();
9646
- }
9647
- }
9648
-
9649
- // Emit the return value if used.
9650
- if (!inst->use_empty())
9651
- {
9652
- CVariable* Dst = GetSymbol(inst);
9653
- if (!retOnStack)
9654
- {
9655
- // non-unifrm funcAddr case has been handled in above loop expansion
9656
- if (funcAddr->IsUniform()) {
9657
- CVariable* Src = m_currShader->GetRETV();
9658
- if (Dst->GetType() == ISA_TYPE_BOOL)
9659
- {
9660
- CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
9661
- m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
9662
- }
9663
- else
9574
+ else
9575
+ {
9576
+ IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9577
+ if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
9664
9578
{
9665
- IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
9666
- if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
9667
- {
9668
- Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
9669
- }
9670
- emitCopyAll(Dst, Src, inst->getType());
9579
+ Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
9671
9580
}
9581
+ emitCopyAll(Dst, Src, inst->getType());
9672
9582
}
9673
9583
}
9674
9584
else
9675
9585
{
9676
- // read return value from stack, from (SP+n)
9586
+ CVariable* retDst = GetSymbol(inst);
9587
+ CVariable* Dst = m_currShader->GetNewVariable(retDst);
9588
+
9677
9589
int RmnBytes = Dst->GetSize();
9678
9590
IGC_ASSERT(Dst->GetType() != ISA_TYPE_BOOL);
9679
9591
uint32_t RdBytes = 0;
@@ -9736,14 +9648,93 @@ void EmitPass::emitStackCall(llvm::CallInst* inst)
9736
9648
else
9737
9649
m_encoder->OWLoad(pTempDst, resource, pTempVar, false, SIZE_OWORD);
9738
9650
m_encoder->Push();
9651
+ m_encoder->SetNoMask();
9739
9652
emitVectorCopy(Dst, pTempDst, RmnBytes / elemSize, RdBytes, 0);
9740
9653
}
9741
9654
}
9742
9655
RdBytes += RdSize;
9743
9656
RmnBytes -= RdSize;
9744
- } while (RmnBytes > 0);
9745
- // end of reading return value from stack
9657
+ } while (RmnBytes > 0); // end of reading return value from stack
9658
+
9659
+ // First do a block read from SP, then a copy that respects the execution mask
9660
+ emitCopyAll(retDst, Dst, inst->getType());
9661
+ }
9662
+ };
9663
+
9664
+ CVariable* funcAddr = GetSymbol(inst->getCalledValue());
9665
+ if (!isIndirectFCall)
9666
+ {
9667
+ m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
9668
+ m_encoder->Push();
9669
+ }
9670
+ else
9671
+ {
9672
+ if (funcAddr->IsUniform())
9673
+ {
9674
+ funcAddr = TruncatePointer(funcAddr);
9675
+ m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
9676
+ m_encoder->Push();
9746
9677
}
9678
+ else
9679
+ {
9680
+ // If the call is not uniform, we have to make a uniform call per lane
9681
+ // First get the execution mask for active lanes
9682
+ CVariable* eMask = GetExecutionMask();
9683
+ // Create a label for the loop
9684
+ uint label = m_encoder->GetNewLabelID();
9685
+ m_encoder->Label(label);
9686
+ m_encoder->Push();
9687
+
9688
+ // Get the first active lane's function address
9689
+ CVariable* offset = nullptr;
9690
+ funcAddr = TruncatePointer(funcAddr);
9691
+ CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
9692
+ // Set the predicate to true for all lanes with the same address
9693
+ CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9694
+ m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
9695
+ m_encoder->Push();
9696
+
9697
+ uint callLabel = m_encoder->GetNewLabelID();
9698
+ m_encoder->SetInversePredicate(true);
9699
+ m_encoder->Jump(callPred, callLabel);
9700
+ m_encoder->Push();
9701
+
9702
+ // Indirect call for all lanes set by the flag
9703
+ m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
9704
+ m_encoder->Copy(eMask, eMask);
9705
+ m_encoder->Push();
9706
+
9707
+ if (!inst->use_empty())
9708
+ {
9709
+ // For non-uniform call, copy the ret inside this loop so that it'll honor the loop mask
9710
+ CopyReturnValue(inst, retOnStack);
9711
+ }
9712
+
9713
+ // Label for lanes that skipped the call
9714
+ m_encoder->Label(callLabel);
9715
+ m_encoder->Push();
9716
+
9717
+ // Unset the bits in execution mask for lanes that were called
9718
+ CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
9719
+ CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
9720
+ m_encoder->Cast(callMask, callPred);
9721
+ m_encoder->Not(callMask, callMask);
9722
+ m_encoder->And(eMask, eMask, callMask);
9723
+ m_encoder->Push();
9724
+ m_encoder->SetP(loopPred, eMask);
9725
+ m_encoder->Push();
9726
+
9727
+ // Loop while there are bits still left in the mask
9728
+ m_encoder->Jump(loopPred, label);
9729
+ m_encoder->Push();
9730
+ }
9731
+ }
9732
+
9733
+ // Emit the return value if used
9734
+ // Non-uniform handled in above loop
9735
+ if (!inst->use_empty() && funcAddr->IsUniform())
9736
+ {
9737
+ CopyReturnValue(inst, retOnStack);
9747
9738
}
9748
9739
9749
9740
// Set the max stack sized pushed in the parent function for this call's args
0 commit comments