vISA: Cross-thread size should be 32-byte aligned instead of GRF size aligned

DianaChen · igcbot · commit 356d1901c818 · 2021-10-21T23:13:22.000+02:00
Cross-thread-size alignment is the convention between IGC and NEO and should be 32-byte aligned.
Update vISA load payload prolog generation accordingly. Also update patch token value "dataParameterStreamSize"
to report the same size as expected in vISA.
diff --git a/IGC/AdaptorOCL/OCL/sp/sp_g8.cpp b/IGC/AdaptorOCL/OCL/sp/sp_g8.cpp
@@ -1872,8 +1872,9 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(
             annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );
     }
 
-    // Payload must be a multiple of a GRF register
-    dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());
+    // Payload must be a multiple of 32 bytes
+    // This assumption has to be the same as in vISA::Optimizer::loadThreadPayload
+    dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);
 
     if( retValue.Success )
     {
diff --git a/visa/Optimizer.cpp b/visa/Optimizer.cpp
@@ -7753,35 +7753,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         {
             return;
         }
-        // indirect data address is at r0.0[5:31]
-        // local thread id is at r0.2[0:7]
-        // use r127 as the header for each oword load
-        uint32_t startGRF =
-            kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
-        uint32_t inputEnd = 32;
-        uint32_t inputCount = kernel.fg.builder->getInputCount();
-        for (unsigned int id = 0; id < inputCount; id++)
-        {
-            input_info_t* input_info = kernel.fg.builder->getInputArg(id);
-            // skip pseudo input for register bindings.
-            if (input_info->isPseudoInput())
-            {
-                continue;
-            }
-            if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
-            {
-              vISA::G4_Declare* dcl = input_info->dcl;
-              if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
-              {
-                  break;
-              }
-            }
-            if (inputEnd < (unsigned)(input_info->size + input_info->offset))
-            {
-                inputEnd = input_info->size + input_info->offset;
-            }
-        }
-        int numGRF = ((inputEnd + getGRFSize() - 1) / getGRFSize()) - startGRF;
+
         std::vector<G4_INST*> instBuffer;
 
         G4_Declare* r0 = builder.createHardwiredDeclare(8, Type_UD, 0, 0);
@@ -7914,6 +7886,41 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         int addrSubreg = 2;
 
         G4_BB* perThreadBB = nullptr;
+
+        // Calculate the payload size:
+        // indirect data address is at r0.0[5:31]
+        // local thread id is at r0.2[0:7]
+        // use r127 as the header for each oword load
+        uint32_t startGRF =
+            kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
+        uint32_t inputEnd = 32;
+        uint32_t inputCount = kernel.fg.builder->getInputCount();
+        for (unsigned int id = 0; id < inputCount; id++)
+        {
+            input_info_t* input_info = kernel.fg.builder->getInputArg(id);
+            // skip pseudo input for register bindings.
+            if (input_info->isPseudoInput())
+            {
+                continue;
+            }
+            if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel())
+            {
+                vISA::G4_Declare* dcl = input_info->dcl;
+                if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn()))
+                {
+                    break;
+                }
+            }
+            if (inputEnd < (unsigned)(input_info->size + input_info->offset))
+            {
+                inputEnd = input_info->size + input_info->offset;
+            }
+        }
+        // cross-thread-payload size must be 32-bytes aligned hence the entire payload must be 32-bytes aligned too
+        // GRF size must be 32-bytes aligned so align inputEnd to 32-bytes satisfies it
+        inputEnd = (inputEnd % 32) ? inputEnd + 32 - inputEnd % 32 : inputEnd;
+        uint32_t payloadSizeByte = inputEnd < startGRF * getGRFSize() ? 0 : inputEnd - startGRF * getGRFSize();
+
         // Load per-thread data, if any. Per-thread data always start from r1
         // this is a fixed size 8 inst (nop padded as necessary), which may be skipped
         // by runtime if the local_id are auto-generated by HW.
@@ -7924,12 +7931,17 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         {
             int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
             int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
+            // per-thread-data size must be GRF-size aligned,
             uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
-            uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
+            // cross-thread-data size must be 32-bytes aligned
+            uint32_t numCrossThreadDW = (CTIS < 0) ?
+                (payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
+                CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
 
             if (useInlineData)
             {
-                numCrossThreadGRF--;
+                // skip the first GRF
+                numCrossThreadDW -= numEltPerGRF<Type_UD>();
             }
             instBuffer.push_back(getLabel("per_thread_prolog"));
 
@@ -7959,7 +7971,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             // create a relocation for cross_thread_size (per_thread_payload_offset). In case of the
             // cross_thread_size is changed after compilation (e.g. gtpin inserted argument), the relocation
             // need to be resolved to the new cross_thread_size.
-            G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadGRF * numEltPerGRF<Type_UB>(), Type_UW);
+            G4_Operand* addSrc1 = builder.createRelocImm(numCrossThreadDW * TypeSize(Type_UD) , Type_UW);
             auto addDst = builder.createDst(rtail->getRegVar(), 0, 2, 1, Type_UD);
             // instruction has relocation must not be compacted
             auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst, addSrc0,
@@ -8041,17 +8053,22 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
                 int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
 
                 uint32_t numPerThreadGRF = PTIS / numEltPerGRF<Type_UB>();
-                uint32_t numCrossThreadGRF = (CTIS < 0) ? numGRF - numPerThreadGRF : CTIS / numEltPerGRF<Type_UB>();
+                // cross-thread-data size must be 32-bytes aligned
+                uint32_t numCrossThreadDW = (CTIS < 0) ?
+                    (payloadSizeByte - numPerThreadGRF * getGRFSize()) / TypeSize(Type_UD) :
+                    CTIS % 32 ? CTIS + 32 - CTIS % 32 : CTIS;
                 uint32_t crossThreadStart = startGRF + numPerThreadGRF;
 
                 if (useInlineData)
                 {
                     // first GRF of cross-thread data is already loaded
                     crossThreadStart++;
-                    numCrossThreadGRF--;
+                    numCrossThreadDW -= numEltPerGRF<Type_UD>();
                 }
                 {
-                    loadFromMemory(rtail, crossThreadStart, numCrossThreadGRF);
+                    // GRF size is 32-bytes in this case so numCrossThreadDW must be GRF size aligned
+                    assert(!(numCrossThreadDW % numEltPerGRF<Type_UD>()));
+                    loadFromMemory(rtail, crossThreadStart, numCrossThreadDW / numEltPerGRF<Type_UD>());
                 }
             }
 

Original file line number	Diff line number	Diff line change
`@@ -1872,8 +1872,9 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(`
`1872`	`1872`	`annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );`
`1873`	`1873`	`}`
`1874`	`1874`
`1875`		`- // Payload must be a multiple of a GRF register`
`1876`		`- dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());`
	`1875`	`+ // Payload must be a multiple of 32 bytes`
	`1876`	`+ // This assumption has to be the same as in vISA::Optimizer::loadThreadPayload`
	`1877`	`+ dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);`
`1877`	`1878`
`1878`	`1879`	`if( retValue.Success )`
`1879`	`1880`	`{`