@@ -7753,35 +7753,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7753
7753
{
7754
7754
return ;
7755
7755
}
7756
- // indirect data address is at r0.0[5:31]
7757
- // local thread id is at r0.2[0:7]
7758
- // use r127 as the header for each oword load
7759
- uint32_t startGRF =
7760
- kernel.getOptions ()->getuInt32Option (vISA_loadThreadPayloadStartReg);
7761
- uint32_t inputEnd = 32 ;
7762
- uint32_t inputCount = kernel.fg .builder ->getInputCount ();
7763
- for (unsigned int id = 0 ; id < inputCount; id++)
7764
- {
7765
- input_info_t * input_info = kernel.fg .builder ->getInputArg (id);
7766
- // skip pseudo input for register bindings.
7767
- if (input_info->isPseudoInput ())
7768
- {
7769
- continue ;
7770
- }
7771
- if (kernel.fg .builder ->getFCPatchInfo ()->getIsEntryKernel ())
7772
- {
7773
- vISA::G4_Declare* dcl = input_info->dcl ;
7774
- if (INPUT_GENERAL == input_info->getInputClass () && !(dcl->isLiveIn ()))
7775
- {
7776
- break ;
7777
- }
7778
- }
7779
- if (inputEnd < (unsigned )(input_info->size + input_info->offset ))
7780
- {
7781
- inputEnd = input_info->size + input_info->offset ;
7782
- }
7783
- }
7784
- int numGRF = ((inputEnd + getGRFSize () - 1 ) / getGRFSize ()) - startGRF;
7756
+
7785
7757
std::vector<G4_INST*> instBuffer;
7786
7758
7787
7759
G4_Declare* r0 = builder.createHardwiredDeclare (8 , Type_UD, 0 , 0 );
@@ -7914,6 +7886,41 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7914
7886
int addrSubreg = 2 ;
7915
7887
7916
7888
G4_BB* perThreadBB = nullptr ;
7889
+
7890
+ // Calculate the payload size:
7891
+ // indirect data address is at r0.0[5:31]
7892
+ // local thread id is at r0.2[0:7]
7893
+ // use r127 as the header for each oword load
7894
+ uint32_t startGRF =
7895
+ kernel.getOptions ()->getuInt32Option (vISA_loadThreadPayloadStartReg);
7896
+ uint32_t inputEnd = 32 ;
7897
+ uint32_t inputCount = kernel.fg .builder ->getInputCount ();
7898
+ for (unsigned int id = 0 ; id < inputCount; id++)
7899
+ {
7900
+ input_info_t * input_info = kernel.fg .builder ->getInputArg (id);
7901
+ // skip pseudo input for register bindings.
7902
+ if (input_info->isPseudoInput ())
7903
+ {
7904
+ continue ;
7905
+ }
7906
+ if (kernel.fg .builder ->getFCPatchInfo ()->getIsEntryKernel ())
7907
+ {
7908
+ vISA::G4_Declare* dcl = input_info->dcl ;
7909
+ if (INPUT_GENERAL == input_info->getInputClass () && !(dcl->isLiveIn ()))
7910
+ {
7911
+ break ;
7912
+ }
7913
+ }
7914
+ if (inputEnd < (unsigned )(input_info->size + input_info->offset ))
7915
+ {
7916
+ inputEnd = input_info->size + input_info->offset ;
7917
+ }
7918
+ }
7919
+ // The cross-thread payload size must be 32-byte aligned, hence the entire payload must be 32-byte aligned too.
7920
+ // The GRF size is itself 32-byte aligned, so rounding inputEnd up to a 32-byte boundary satisfies this.
7921
+ inputEnd = (inputEnd % 32 ) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
7922
+ uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize () ? 0 : inputEnd - startGRF * getGRFSize ();
7923
+
7917
7924
// Load per-thread data, if any. Per-thread data always start from r1
7918
7925
// this is a fixed size 8 inst (nop padded as necessary), which may be skipped
7919
7926
// by runtime if the local_id are auto-generated by HW.
@@ -7924,12 +7931,17 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7924
7931
{
7925
7932
int PTIS = kernel.getInt32KernelAttr (Attributes::ATTR_PerThreadInputSize);
7926
7933
int CTIS = kernel.getInt32KernelAttr (Attributes::ATTR_CrossThreadInputSize);
7934
+ // per-thread-data size must be GRF-size aligned
7927
7935
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
7928
- uint32_t numCrossThreadGRF = (CTIS < 0 ) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
7936
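+ // cross-thread-data size must be 32-byte aligned. NOTE(review): the CTIS >= 0 branch below is not divided by TypeSize(Type_UD) like the CTIS < 0 branch, so it appears to yield bytes rather than DWs -- confirm the unit.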
7937
+ uint32_t numCrossThreadDW = (CTIS < 0 ) ?
7938
+ (payloadSizeByte - numPerThreadGRF * getGRFSize ()) / TypeSize (Type_UD) :
7939
+ CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
7929
7940
7930
7941
if (useInlineData)
7931
7942
{
7932
- numCrossThreadGRF--;
7943
+ // skip the first GRF
7944
+ numCrossThreadDW -= numEltPerGRF<Type_UD>();
7933
7945
}
7934
7946
instBuffer.push_back (getLabel (" per_thread_prolog" ));
7935
7947
@@ -7959,7 +7971,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7959
7971
// create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
7960
7972
// cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
7961
7973
// need to be resolved to the new cross_thread_size.
7962
- G4_Operand* addSrc1 = builder.createRelocImm (numCrossThreadGRF * numEltPerGRF<Type_UB>() , Type_UW);
7974
+ G4_Operand* addSrc1 = builder.createRelocImm (numCrossThreadDW * TypeSize (Type_UD) , Type_UW);
7963
7975
auto addDst = builder.createDst (rtail->getRegVar (), 0 , 2 , 1 , Type_UD);
7964
7976
// an instruction that has a relocation must not be compacted
7965
7977
auto addInst = builder.createBinOp (G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8041,17 +8053,22 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8041
8053
int CTIS = kernel.getInt32KernelAttr (Attributes::ATTR_CrossThreadInputSize);
8042
8054
8043
8055
uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
8044
- uint32_t numCrossThreadGRF = (CTIS < 0 ) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
8056
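+ // cross-thread-data size must be 32-byte aligned. NOTE(review): the CTIS >= 0 branch below is not divided by TypeSize(Type_UD) like the CTIS < 0 branch, so it appears to yield bytes rather than DWs -- confirm the unit.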
8057
+ uint32_t numCrossThreadDW = (CTIS < 0 ) ?
8058
+ (payloadSizeByte - numPerThreadGRF * getGRFSize ()) / TypeSize (Type_UD) :
8059
+ CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
8045
8060
uint32_t crossThreadStart = startGRF + numPerThreadGRF;
8046
8061
8047
8062
if (useInlineData)
8048
8063
{
8049
8064
// first GRF of cross-thread data is already loaded
8050
8065
crossThreadStart++;
8051
- numCrossThreadGRF-- ;
8066
+ numCrossThreadDW -= numEltPerGRF<Type_UD>() ;
8052
8067
}
8053
8068
{
8054
- loadFromMemory (rtail, crossThreadStart, numCrossThreadGRF);
8069
+ // GRF size is 32 bytes in this case, so numCrossThreadDW must be GRF-size aligned
8070
+ assert (!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
8071
+ loadFromMemory (rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
8055
8072
}
8056
8073
}
8057
8074
0 commit comments