
Commit 065dba6

jgu222 authored and igcbot committed
Improve scalar atomic add/sub
For a scalar atomic (add/sub/inc/dec) with no return value and a uniform addend, a more efficient code sequence is now used. For example, "atomic_add (16|M0) p, 1" becomes:

    emask = current emask
    numBits = numOfOne(emask);
    (W) atomic_add (1|M0) p, numBits

numBits is saved for reuse within the same BB.
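Below is a minimal C++ model of the folding (a sketch with hypothetical names such as foldedAddend; the actual pass operates on vISA CVariables via CBit/Mul rather than C++ scalars): scaling the uniform addend by the popcount of the execution mask yields the same sum the per-lane atomics would have produced.

    #include <bit>      // std::popcount (C++20)
    #include <cstdint>

    // Hypothetical scalar model: fold per-lane atomic adds of a uniform
    // addend into one no-mask atomic add of addend * popcount(emask).
    uint32_t foldedAddend(uint16_t emask, uint32_t addend)
    {
        // numBits = numOfOne(emask): one set bit per active lane
        uint32_t numActiveLanes = std::popcount(emask);
        return addend * numActiveLanes; // value the single (W) atomic_add applies
    }

    // e.g. emask = 0b1011011000100010 (7 active lanes), addend = 1:
    //      foldedAddend(emask, 1) == 7, matching seven per-lane adds of 1.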
1 parent f19f59d commit 065dba6

File tree

3 files changed: +101 −23 lines


IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 89 additions & 23 deletions
@@ -943,7 +943,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
     for (uint i = 0; i < m_pattern->m_numBlocks; i++)
     {
         SBasicBlock& block = m_pattern->m_blocks[i];
-        block.m_activeMask = nullptr; // clear for each SIMD size
+        block.clearCaching(); // clear for each SIMD size
         m_currentBlock = i;
         if (m_blockCoalescing->IsEmptyBlock(block.bb))
         {
@@ -975,6 +975,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
         while (I != E)
         {
             Instruction* llvmInst = I->m_root;
+            resetCurrInstNumInstances();
+
             if (llvmInst->getDebugLoc())
             {
                 unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
@@ -1004,6 +1006,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
             bool slicing = false;
             uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
             IGC_ASSERT(numInstance == 1 || numInstance == 2);
+            // cache the number of instances
+            setCurrInstNumInstances(numInstance);

             if (slicing && !disableSlicing)
             {
@@ -1033,6 +1037,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
                 if (slicing)
                 {
                     numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
+                    setCurrInstNumInstances(numInstance);
                 }

                 if (llvmtoVISADump)
@@ -12327,32 +12332,93 @@ void EmitPass::emitScalarAtomics(
         uniformAtomicOp = EATOMIC_IADD;
     }
     bool returnsImmValue = (!pInst->use_empty());
-    CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
-        1,
-        type,
-        isA64 ? EALIGN_2GRF : EALIGN_GRF,
-        true,
-        CName::NONE);
+    CVariable* pFinalAtomicSrcVal;
     CVariable* pSrcsArr[2] = { nullptr, nullptr };
-    if (returnsImmValue)
+
+    if (op == EOPCODE_ADD && bitWidth == 32 && pSrc->IsUniform() &&
+        getCurrInstNumInstances() == 1 && !returnsImmValue)
     {
-        // sum all the lanes
-        emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
+        // Special case for a uniform DW src (like atomic_add(1)) without a return value.
+        // Note: limit this code to a single instance for now, as a scalar atomic must
+        // have instance = 1 (see DecideInstanceAndSlice()).
+        //
+        // The following sequence will be generated:
+        //   (W) mov (16|M0)  f0.0<1>:uw   0:uw
+        //       cmp.eq.f0.0 (16|M0)  dummy:uw  dummy:uw
+        //   (W) mov (1|M0)   r2.0<1>:uw   f0.0:uw
+        //   (W) cbit (1|M0)  r1.0:uw      r2.0:uw   <-- r1.0: number of active lanes
+        //   (W) mul (1|M0)   r10:ud       pSrc  r1.0:uw
+        SBasicBlock& currBlk = getCurrentBlock();
+        CVariable* numActiveLanes = currBlk.m_numActiveLanes;
+        if (numActiveLanes == nullptr)
+        {
+            CVariable* emask = GetExecutionMask(); // execution mask for the entire dispatch size
+            // Count the number of '1' bits in the execmask to get the number of active
+            // lanes. For example, given emask = 1011011000100010b, numActiveLanes = 7.
+            // This handles cases in which not all lanes are active.
+            numActiveLanes = m_currShader->GetNewVariable(1, ISA_TYPE_W, EALIGN_DWORD, true, CName::NONE);
+            m_encoder->CBit(numActiveLanes, emask);
+            m_encoder->Push();
+
+            // Save it for possible re-use later.
+            currBlk.m_numActiveLanes = numActiveLanes;
+        }

-        CVariable* pSrcCopy = pSrcsArr[0];
-        if (m_currShader->m_numberInstance == 2)
+        // pFinalAtomicSrcVal is used in the msg's payload and thus needs to be GRF-aligned.
+        pFinalAtomicSrcVal = m_currShader->GetNewVariable(1, ISA_TYPE_D, EALIGN_GRF, true, CName::NONE);
+        if (pSrc->IsImmediate() && pSrc->GetImmediateValue() == 1)
         {
-            pSrcCopy = pSrcsArr[1];
+            if (negateSrc)
+            {
+                m_encoder->SetSrcModifier(0, EMOD_NEG);
+            }
+            m_encoder->Cast(pFinalAtomicSrcVal, numActiveLanes);
+            m_encoder->Push();
         }
+        else
+        {
+            m_encoder->Mul(pFinalAtomicSrcVal, pSrc, numActiveLanes);
+            m_encoder->Push();

-        m_encoder->SetSrcRegion(0, 0, 1, 0);
-        m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
-        m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
-        m_encoder->Push();
+            // Using the neg srcmod with mul would end up with more insts, so use the srcmod on the mov.
+            if (negateSrc)
+            {
+                m_encoder->SetSrcModifier(0, EMOD_NEG);
+            }
+            m_encoder->Copy(pFinalAtomicSrcVal, pFinalAtomicSrcVal);
+            m_encoder->Push();
+        }
     }
     else
     {
-        emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
+        // general case
+        pFinalAtomicSrcVal = m_currShader->GetNewVariable(
+            1,
+            type,
+            isA64 ? EALIGN_2GRF : EALIGN_GRF,
+            true,
+            CName::NONE);
+
+        if (returnsImmValue)
+        {
+            // sum all the lanes
+            emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
+
+            CVariable* pSrcCopy = pSrcsArr[0];
+            if (m_currShader->m_numberInstance == 2)
+            {
+                pSrcCopy = pSrcsArr[1];
+            }
+
+            m_encoder->SetSrcRegion(0, 0, 1, 0);
+            m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
+            m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
+            m_encoder->Push();
+        }
+        else
+        {
+            emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
+        }
     }

     auto moveToReg = [&](CVariable*& pVar)
@@ -12388,11 +12454,6 @@ void EmitPass::emitScalarAtomics(
     m_encoder->SetSimdSize(SIMDMode::SIMD1);
     m_encoder->SetNoMask();

-    CVariable* pReturnVal = returnsImmValue ?
-        m_currShader->GetNewVariable(
-            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
-        nullptr;
-
     if (bitWidth == 16)
     {
         CVariable* pCastAtomicSrcVal =
@@ -12402,6 +12463,11 @@ void EmitPass::emitScalarAtomics(
         pFinalAtomicSrcVal = pCastAtomicSrcVal;
     }

+    CVariable* pReturnVal = returnsImmValue ?
+        m_currShader->GetNewVariable(
+            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
+        nullptr;
+
     if (shouldGenerateLSC(pInst))
     {
         m_encoder->LSC_AtomicRaw(
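As a worked check of the special case above (mask and addend values chosen for illustration, not taken from the commit): for a SIMD16 dispatch with emask = 1011011000100010b, cbit yields numActiveLanes = 7; a non-returning atomic_add of a uniform 5 then issues one (W) mul producing 35 followed by a single scalar atomic message instead of a 16-lane one. For the common addend of 1, the mul is skipped and numActiveLanes is cast directly into the payload, with the neg source modifier applied for atomic_sub.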

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 6 additions & 0 deletions
@@ -736,6 +736,12 @@ class EmitPass : public llvm::FunctionPass

     llvm::DenseMap<llvm::Instruction*, bool> instrMap;

+    // caching the number of instances for the current inst.
+    int16_t m_currInstNumInstances = -1;
+    inline void resetCurrInstNumInstances() { m_currInstNumInstances = -1; }
+    inline void setCurrInstNumInstances(int16_t aV) { m_currInstNumInstances = aV; }
+    inline int16_t getCurrInstNumInstances() const { return m_currInstNumInstances; }
+
     // Current rounding Mode
     // As RM of FPCvtInt and FP could be different, there
     // are two fields to keep track of their current values.

IGC/Compiler/CISACodeGen/PatternMatchPass.hpp

Lines changed: 6 additions & 0 deletions
@@ -106,6 +106,12 @@ namespace IGC
         // caches the active lane mask (a flag variable) for this BB
         // this is currently set only when we enable the A64 WA
         CVariable* m_activeMask = nullptr;
+        // caches the number of active lanes under the dispatch size (not the 1st or 2nd instance)
+        CVariable* m_numActiveLanes = nullptr;
+        void clearCaching() {
+            m_activeMask = nullptr;
+            m_numActiveLanes = nullptr;
+        }
     };

     class CodeGenPatternMatch : public llvm::FunctionPass, public llvm::InstVisitor<CodeGenPatternMatch>
