
Commit 065dba6

jgu222 authored and igcbot committed
Improve scalar atomic add/sub
For a scalar atomic (add/sub/inc/dec) with no return value and a uniform addend, a more efficient code sequence is now used. For example, "atomic_add (16|M0) p, 1" becomes:

    emask = current emask
    numBits = numOfOne(emask);
    (W) atomic_add (1|M0) p, numBits

numBits is saved for reuse within the same BB.
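Below is a minimal C++ model of the folding (a sketch with hypothetical names such as foldedAddend; the actual pass operates on vISA CVariables via CBit/Mul rather than C++ scalars): scaling the uniform addend by the popcount of the execution mask yields the same sum the per-lane atomics would have produced.

    #include <bit>      // std::popcount (C++20)
    #include <cstdint>

    // Hypothetical scalar model: fold per-lane atomic adds of a uniform
    // addend into one no-mask atomic add of addend * popcount(emask).
    uint32_t foldedAddend(uint16_t emask, uint32_t addend)
    {
        // numBits = numOfOne(emask): one set bit per active lane
        uint32_t numActiveLanes = std::popcount(emask);
        return addend * numActiveLanes; // value the single (W) atomic_add applies
    }

    // e.g. emask = 0b1011011000100010 (7 active lanes), addend = 1:
    //      foldedAddend(emask, 1) == 7, matching seven per-lane adds of 1.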
1 parent f19f59d commit 065dba6

File tree

3 files changed: +101 −23 lines


IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 89 additions & 23 deletions
@@ -943,7 +943,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
     for (uint i = 0; i < m_pattern->m_numBlocks; i++)
     {
         SBasicBlock& block = m_pattern->m_blocks[i];
-        block.m_activeMask = nullptr; // clear for each SIMD size
+        block.clearCaching(); // clear for each SIMD size
         m_currentBlock = i;
         if (m_blockCoalescing->IsEmptyBlock(block.bb))
         {
@@ -975,6 +975,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
         while (I != E)
         {
             Instruction* llvmInst = I->m_root;
+            resetCurrInstNumInstances();
+
             if (llvmInst->getDebugLoc())
             {
                 unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
@@ -1004,6 +1006,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
             bool slicing = false;
             uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
             IGC_ASSERT(numInstance == 1 || numInstance == 2);
+            // cache the number of instances
+            setCurrInstNumInstances(numInstance);

             if (slicing && !disableSlicing)
             {
@@ -1033,6 +1037,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
                 if (slicing)
                 {
                     numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
+                    setCurrInstNumInstances(numInstance);
                 }

                 if (llvmtoVISADump)
@@ -12327,32 +12332,93 @@ void EmitPass::emitScalarAtomics(
         uniformAtomicOp = EATOMIC_IADD;
     }
     bool returnsImmValue = (!pInst->use_empty());
-    CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
-        1,
-        type,
-        isA64 ? EALIGN_2GRF : EALIGN_GRF,
-        true,
-        CName::NONE);
+    CVariable* pFinalAtomicSrcVal;
     CVariable* pSrcsArr[2] = { nullptr, nullptr };
-    if (returnsImmValue)
+
+    if (op == EOPCODE_ADD && bitWidth == 32 && pSrc->IsUniform() &&
+        getCurrInstNumInstances() == 1 && !returnsImmValue)
     {
-        // sum all the lanes
-        emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
+        // Special case for a uniform DW src (like atomic_add(1)) without a return value.
+        // Note: limit this code to a single instance for now, as a scalar atomic must
+        // have instance = 1 (see DecideInstanceAndSlice()).
+        //
+        // The following sequence will be generated:
+        //   (W) mov (16|M0)  f0.0<1>:uw   0:uw
+        //       cmp.eq.f0.0 (16|M0)  dummy:uw  dummy:uw
+        //   (W) mov (1|M0)   r2.0<1>:uw   f0.0:uw
+        //   (W) cbit (1|M0)  r1.0:uw      r2.0:uw   <-- r1.0: number of active lanes
+        //   (W) mul (1|M0)   r10:ud       pSrc  r1.0:uw
+        SBasicBlock& currBlk = getCurrentBlock();
+        CVariable* numActiveLanes = currBlk.m_numActiveLanes;
+        if (numActiveLanes == nullptr)
+        {
+            CVariable* emask = GetExecutionMask(); // execution mask for the entire dispatch size
+            // Count the number of '1' bits in the execmask to get the number of active
+            // lanes. For example, given emask = 1011011000100010b, numActiveLanes = 7.
+            // This handles cases in which not all lanes are active.
+            numActiveLanes = m_currShader->GetNewVariable(1, ISA_TYPE_W, EALIGN_DWORD, true, CName::NONE);
+            m_encoder->CBit(numActiveLanes, emask);
+            m_encoder->Push();
+
+            // Save it for possible re-use later.
+            currBlk.m_numActiveLanes = numActiveLanes;
+        }

-        CVariable* pSrcCopy = pSrcsArr[0];
-        if (m_currShader->m_numberInstance == 2)
+        // pFinalAtomicSrcVal is used in the msg's payload and thus needs to be GRF-aligned.
+        pFinalAtomicSrcVal = m_currShader->GetNewVariable(1, ISA_TYPE_D, EALIGN_GRF, true, CName::NONE);
+        if (pSrc->IsImmediate() && pSrc->GetImmediateValue() == 1)
         {
-            pSrcCopy = pSrcsArr[1];
+            if (negateSrc)
+            {
+                m_encoder->SetSrcModifier(0, EMOD_NEG);
+            }
+            m_encoder->Cast(pFinalAtomicSrcVal, numActiveLanes);
+            m_encoder->Push();
         }
+        else
+        {
+            m_encoder->Mul(pFinalAtomicSrcVal, pSrc, numActiveLanes);
+            m_encoder->Push();

-        m_encoder->SetSrcRegion(0, 0, 1, 0);
-        m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
-        m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
-        m_encoder->Push();
+            // Using the neg srcmod with mul would end up with more insts, so use the srcmod on the mov.
+            if (negateSrc)
+            {
+                m_encoder->SetSrcModifier(0, EMOD_NEG);
+            }
+            m_encoder->Copy(pFinalAtomicSrcVal, pFinalAtomicSrcVal);
+            m_encoder->Push();
+        }
     }
     else
     {
-        emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
+        // general case
+        pFinalAtomicSrcVal = m_currShader->GetNewVariable(
+            1,
+            type,
+            isA64 ? EALIGN_2GRF : EALIGN_GRF,
+            true,
+            CName::NONE);
+
+        if (returnsImmValue)
+        {
+            // sum all the lanes
+            emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
+
+            CVariable* pSrcCopy = pSrcsArr[0];
+            if (m_currShader->m_numberInstance == 2)
+            {
+                pSrcCopy = pSrcsArr[1];
+            }
+
+            m_encoder->SetSrcRegion(0, 0, 1, 0);
+            m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
+            m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
+            m_encoder->Push();
+        }
+        else
+        {
+            emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
+        }
     }

     auto moveToReg = [&](CVariable*& pVar)
@@ -12388,11 +12454,6 @@ void EmitPass::emitScalarAtomics(
     m_encoder->SetSimdSize(SIMDMode::SIMD1);
     m_encoder->SetNoMask();

-    CVariable* pReturnVal = returnsImmValue ?
-        m_currShader->GetNewVariable(
-            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
-        nullptr;
-
     if (bitWidth == 16)
     {
         CVariable* pCastAtomicSrcVal =
@@ -12402,6 +12463,11 @@ void EmitPass::emitScalarAtomics(
         pFinalAtomicSrcVal = pCastAtomicSrcVal;
     }

+    CVariable* pReturnVal = returnsImmValue ?
+        m_currShader->GetNewVariable(
+            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
+        nullptr;
+
     if (shouldGenerateLSC(pInst))
     {
         m_encoder->LSC_AtomicRaw(
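As a worked check of the special case above (mask and addend values chosen for illustration, not taken from the commit): for a SIMD16 dispatch with emask = 1011011000100010b, cbit yields numActiveLanes = 7; a non-returning atomic_add of a uniform 5 then issues one (W) mul producing 35 followed by a single scalar atomic message instead of a 16-lane one. For the common addend of 1, the mul is skipped and numActiveLanes is cast directly into the payload, with the neg source modifier applied for atomic_sub.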

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 6 additions & 0 deletions
@@ -736,6 +736,12 @@ class EmitPass : public llvm::FunctionPass

     llvm::DenseMap<llvm::Instruction*, bool> instrMap;

+    // caching the number of instances for the current inst.
+    int16_t m_currInstNumInstances = -1;
+    inline void resetCurrInstNumInstances() { m_currInstNumInstances = -1; }
+    inline void setCurrInstNumInstances(int16_t aV) { m_currInstNumInstances = aV; }
+    inline int16_t getCurrInstNumInstances() const { return m_currInstNumInstances; }
+
     // Current rounding Mode
     // As RM of FPCvtInt and FP could be different, there
     // are two fields to keep track of their current values.

IGC/Compiler/CISACodeGen/PatternMatchPass.hpp

Lines changed: 6 additions & 0 deletions
@@ -106,6 +106,12 @@ namespace IGC
         // caches the active lane mask (a flag variable) for this BB
         // this is currently set only when we enable the A64 WA
         CVariable* m_activeMask = nullptr;
+        // caches the number of active lanes under the dispatch size (not the 1st or 2nd instance)
+        CVariable* m_numActiveLanes = nullptr;
+        void clearCaching() {
+            m_activeMask = nullptr;
+            m_numActiveLanes = nullptr;
+        }
     };

     class CodeGenPatternMatch : public llvm::FunctionPass, public llvm::InstVisitor<CodeGenPatternMatch>
