Skip to content

Commit c74ad90

Browse files
jgu222igcbot
authored and committed
[Autobackout][FuncReg] Revert of change: 065dba6
Improve scalar atomic add/sub. For a scalar atomic (add/sub/inc/dec) without a return value and with a uniform addend, a more efficient code sequence is used. For example, "atomic_add (16|M0) p, 1" becomes: emask = current emask; numBits = numOfOne(emask); (W) atomic_add (1|M0) p, numBits. We basically save numBits for reuse within the same BB.
1 parent 065dba6 commit c74ad90

File tree

3 files changed

+23
-101
lines changed

3 files changed

+23
-101
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 23 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -943,7 +943,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
943943
for (uint i = 0; i < m_pattern->m_numBlocks; i++)
944944
{
945945
SBasicBlock& block = m_pattern->m_blocks[i];
946-
block.clearCaching(); // clear for each SIMD size
946+
block.m_activeMask = nullptr; // clear for each SIMD size
947947
m_currentBlock = i;
948948
if (m_blockCoalescing->IsEmptyBlock(block.bb))
949949
{
@@ -975,8 +975,6 @@ bool EmitPass::runOnFunction(llvm::Function& F)
975975
while (I != E)
976976
{
977977
Instruction* llvmInst = I->m_root;
978-
resetCurrInstNumInstances();
979-
980978
if (llvmInst->getDebugLoc())
981979
{
982980
unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
@@ -1006,8 +1004,6 @@ bool EmitPass::runOnFunction(llvm::Function& F)
10061004
bool slicing = false;
10071005
uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
10081006
IGC_ASSERT(numInstance == 1 || numInstance == 2);
1009-
// caching the number of instance
1010-
setCurrInstNumInstances(numInstance);
10111007

10121008
if (slicing && !disableSlicing)
10131009
{
@@ -1037,7 +1033,6 @@ bool EmitPass::runOnFunction(llvm::Function& F)
10371033
if (slicing)
10381034
{
10391035
numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
1040-
setCurrInstNumInstances(numInstance);
10411036
}
10421037

10431038
if (llvmtoVISADump)
@@ -12332,93 +12327,32 @@ void EmitPass::emitScalarAtomics(
1233212327
uniformAtomicOp = EATOMIC_IADD;
1233312328
}
1233412329
bool returnsImmValue = (!pInst->use_empty());
12335-
CVariable* pFinalAtomicSrcVal;
12330+
CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12331+
1,
12332+
type,
12333+
isA64 ? EALIGN_2GRF : EALIGN_GRF,
12334+
true,
12335+
CName::NONE);
1233612336
CVariable* pSrcsArr[2] = { nullptr, nullptr };
12337-
12338-
if (op == EOPCODE_ADD && bitWidth == 32 && pSrc->IsUniform() &&
12339-
getCurrInstNumInstances() == 1 && !returnsImmValue)
12337+
if (returnsImmValue)
1234012338
{
12341-
// Special case for uniform DW src (like atomic_add(1) without return value.
12342-
// Note: limit this code for a single instance for now as scalar atomic must have
12343-
// instance = 1 (see DecideInstanceAndSlice()).
12344-
//
12345-
// The following sequence will be generated:
12346-
// (W) mov (16|M0) f0.0<1>:uw 0:uw
12347-
// cmp.eq.f0.0 (16|M0) dummy:uw dummy:uw
12348-
// (W) mov (1|M0) r2.0<1>:uw f0.0:uw
12349-
// (W) cbit (1|M0) r1.0:uw r2.0:uw <-- r1.0 : number of active lanes
12350-
// (W) mul (1|M0) r10:ud pSrc r1.0:uw
12351-
SBasicBlock& currBlk = getCurrentBlock();
12352-
CVariable* numActiveLanes = currBlk.m_numActiveLanes;
12353-
if (numActiveLanes == nullptr)
12354-
{
12355-
CVariable* emask = GetExecutionMask(); // execution mask for the entire dispatch size
12356-
// Count the number of '1' bits we have in the execmask to get the number of active lanes.
12357-
// For example, given emask = 1011011000100010b, numActiveLanes = 7
12358-
// This will handle cases in which not all lanes are active.
12359-
numActiveLanes = m_currShader->GetNewVariable(1, ISA_TYPE_W, EALIGN_DWORD, true, CName::NONE);
12360-
m_encoder->CBit(numActiveLanes, emask);
12361-
m_encoder->Push();
12362-
12363-
// save it for possible re-use later.
12364-
currBlk.m_numActiveLanes = numActiveLanes;
12365-
}
12339+
// sum all the lanes
12340+
emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
1236612341

12367-
// pFinalAtomicSrcVal is used in msg's payload and thus needs to be GRF-aligned
12368-
pFinalAtomicSrcVal = m_currShader->GetNewVariable(1, ISA_TYPE_D, EALIGN_GRF, true, CName::NONE);
12369-
if (pSrc->IsImmediate() && pSrc->GetImmediateValue() == 1)
12342+
CVariable* pSrcCopy = pSrcsArr[0];
12343+
if (m_currShader->m_numberInstance == 2)
1237012344
{
12371-
if (negateSrc)
12372-
{
12373-
m_encoder->SetSrcModifier(0, EMOD_NEG);
12374-
}
12375-
m_encoder->Cast(pFinalAtomicSrcVal, numActiveLanes);
12376-
m_encoder->Push();
12345+
pSrcCopy = pSrcsArr[1];
1237712346
}
12378-
else
12379-
{
12380-
m_encoder->Mul(pFinalAtomicSrcVal, pSrc, numActiveLanes);
12381-
m_encoder->Push();
1238212347

12383-
// using neg srcmod with mul will end up with more insts, thus using srcmod on mov
12384-
if (negateSrc)
12385-
{
12386-
m_encoder->SetSrcModifier(0, EMOD_NEG);
12387-
}
12388-
m_encoder->Copy(pFinalAtomicSrcVal, pFinalAtomicSrcVal);
12389-
m_encoder->Push();
12390-
}
12348+
m_encoder->SetSrcRegion(0, 0, 1, 0);
12349+
m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12350+
m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12351+
m_encoder->Push();
1239112352
}
1239212353
else
1239312354
{
12394-
// general case
12395-
pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12396-
1,
12397-
type,
12398-
isA64 ? EALIGN_2GRF : EALIGN_GRF,
12399-
true,
12400-
CName::NONE);
12401-
12402-
if (returnsImmValue)
12403-
{
12404-
// sum all the lanes
12405-
emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
12406-
12407-
CVariable* pSrcCopy = pSrcsArr[0];
12408-
if (m_currShader->m_numberInstance == 2)
12409-
{
12410-
pSrcCopy = pSrcsArr[1];
12411-
}
12412-
12413-
m_encoder->SetSrcRegion(0, 0, 1, 0);
12414-
m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12415-
m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12416-
m_encoder->Push();
12417-
}
12418-
else
12419-
{
12420-
emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
12421-
}
12355+
emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
1242212356
}
1242312357

1242412358
auto moveToReg = [&](CVariable*& pVar)
@@ -12454,6 +12388,11 @@ void EmitPass::emitScalarAtomics(
1245412388
m_encoder->SetSimdSize(SIMDMode::SIMD1);
1245512389
m_encoder->SetNoMask();
1245612390

12391+
CVariable* pReturnVal = returnsImmValue ?
12392+
m_currShader->GetNewVariable(
12393+
1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12394+
nullptr;
12395+
1245712396
if (bitWidth == 16)
1245812397
{
1245912398
CVariable* pCastAtomicSrcVal =
@@ -12463,11 +12402,6 @@ void EmitPass::emitScalarAtomics(
1246312402
pFinalAtomicSrcVal = pCastAtomicSrcVal;
1246412403
}
1246512404

12466-
CVariable* pReturnVal = returnsImmValue ?
12467-
m_currShader->GetNewVariable(
12468-
1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12469-
nullptr;
12470-
1247112405
if (shouldGenerateLSC(pInst))
1247212406
{
1247312407
m_encoder->LSC_AtomicRaw(

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -736,12 +736,6 @@ class EmitPass : public llvm::FunctionPass
736736

737737
llvm::DenseMap<llvm::Instruction*, bool> instrMap;
738738

739-
// caching the number of instances for the current inst.
740-
int16_t m_currInstNumInstances = -1;
741-
inline void resetCurrInstNumInstances() { m_currInstNumInstances = -1; }
742-
inline void setCurrInstNumInstances(int16_t aV) { m_currInstNumInstances = aV; }
743-
inline int16_t getCurrInstNumInstances() const { return m_currInstNumInstances; }
744-
745739
// Current rounding Mode
746740
// As RM of FPCvtInt and FP could be different, there
747741
// are two fields to keep track of their current values.

IGC/Compiler/CISACodeGen/PatternMatchPass.hpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,6 @@ namespace IGC
106106
// caches the active lane mask (a flag variable) for this BB
107107
// this is currently set only when we enable the A64 WA
108108
CVariable* m_activeMask = nullptr;
109-
// caching of the number of active lanes under dispatch size (not 1st or 2nd instances)
110-
CVariable* m_numActiveLanes;
111-
void clearCaching() {
112-
m_activeMask = nullptr;
113-
m_numActiveLanes = nullptr;
114-
}
115109
};
116110

117111
class CodeGenPatternMatch : public llvm::FunctionPass, public llvm::InstVisitor<CodeGenPatternMatch>

0 commit comments

Comments (0)