@@ -943,7 +943,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
943
943
for (uint i = 0; i < m_pattern->m_numBlocks; i++)
944
944
{
945
945
SBasicBlock& block = m_pattern->m_blocks[i];
946
- block.m_activeMask = nullptr ; // clear for each SIMD size
946
+ block.clearCaching() ; // clear for each SIMD size
947
947
m_currentBlock = i;
948
948
if (m_blockCoalescing->IsEmptyBlock(block.bb))
949
949
{
@@ -975,6 +975,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
975
975
while (I != E)
976
976
{
977
977
Instruction* llvmInst = I->m_root;
978
+ resetCurrInstNumInstances();
979
+
978
980
if (llvmInst->getDebugLoc())
979
981
{
980
982
unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
@@ -1004,6 +1006,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
1004
1006
bool slicing = false;
1005
1007
uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
1006
1008
IGC_ASSERT(numInstance == 1 || numInstance == 2);
1009
+ // cache the number of instances
1010
+ setCurrInstNumInstances(numInstance);
1007
1011
1008
1012
if (slicing && !disableSlicing)
1009
1013
{
@@ -1033,6 +1037,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
1033
1037
if (slicing)
1034
1038
{
1035
1039
numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
1040
+ setCurrInstNumInstances(numInstance);
1036
1041
}
1037
1042
1038
1043
if (llvmtoVISADump)
@@ -12327,32 +12332,93 @@ void EmitPass::emitScalarAtomics(
12327
12332
uniformAtomicOp = EATOMIC_IADD;
12328
12333
}
12329
12334
bool returnsImmValue = (!pInst->use_empty());
12330
- CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12331
- 1,
12332
- type,
12333
- isA64 ? EALIGN_2GRF : EALIGN_GRF,
12334
- true,
12335
- CName::NONE);
12335
+ CVariable* pFinalAtomicSrcVal;
12336
12336
CVariable* pSrcsArr[2] = { nullptr, nullptr };
12337
- if (returnsImmValue)
12337
+
12338
+ if (op == EOPCODE_ADD && bitWidth == 32 && pSrc->IsUniform() &&
12339
+ getCurrInstNumInstances() == 1 && !returnsImmValue)
12338
12340
{
12339
- // sum all the lanes
12340
- emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
12341
+ // Special case for a uniform DW src (like atomic_add(1)) without a return value.
12342
+ // Note: limit this code to a single instance for now, as a scalar atomic must have
12343
+ // instance = 1 (see DecideInstanceAndSlice()).
12344
+ //
12345
+ // The following sequence will be generated:
12346
+ // (W) mov (16|M0) f0.0<1>:uw 0:uw
12347
+ // cmp.eq.f0.0 (16|M0) dummy:uw dummy:uw
12348
+ // (W) mov (1|M0) r2.0<1>:uw f0.0:uw
12349
+ // (W) cbit (1|M0) r1.0:uw r2.0:uw <-- r1.0 : number of active lanes
12350
+ // (W) mul (1|M0) r10:ud pSrc r1.0:uw
12351
+ SBasicBlock& currBlk = getCurrentBlock();
12352
+ CVariable* numActiveLanes = currBlk.m_numActiveLanes;
12353
+ if (numActiveLanes == nullptr)
12354
+ {
12355
+ CVariable* emask = GetExecutionMask(); // execution mask for the entire dispatch size
12356
+ // Count the number of '1' bits we have in the execmask to get the number of active lanes.
12357
+ // For example, given emask = 1011011000100010b, numActiveLanes = 7
12358
+ // This will handle cases in which not all lanes are active.
12359
+ numActiveLanes = m_currShader->GetNewVariable(1, ISA_TYPE_W, EALIGN_DWORD, true, CName::NONE);
12360
+ m_encoder->CBit(numActiveLanes, emask);
12361
+ m_encoder->Push();
12362
+
12363
+ // save it for possible re-use later.
12364
+ currBlk.m_numActiveLanes = numActiveLanes;
12365
+ }
12341
12366
12342
- CVariable* pSrcCopy = pSrcsArr[0];
12343
- if (m_currShader->m_numberInstance == 2)
12367
+ // pFinalAtomicSrcVal is used in msg's payload and thus needs to be GRF-aligned
12368
+ pFinalAtomicSrcVal = m_currShader->GetNewVariable(1, ISA_TYPE_D, EALIGN_GRF, true, CName::NONE);
12369
+ if (pSrc->IsImmediate() && pSrc->GetImmediateValue() == 1)
12344
12370
{
12345
- pSrcCopy = pSrcsArr[1];
12371
+ if (negateSrc)
12372
+ {
12373
+ m_encoder->SetSrcModifier(0, EMOD_NEG);
12374
+ }
12375
+ m_encoder->Cast(pFinalAtomicSrcVal, numActiveLanes);
12376
+ m_encoder->Push();
12346
12377
}
12378
+ else
12379
+ {
12380
+ m_encoder->Mul(pFinalAtomicSrcVal, pSrc, numActiveLanes);
12381
+ m_encoder->Push();
12347
12382
12348
- m_encoder->SetSrcRegion(0, 0, 1, 0);
12349
- m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12350
- m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12351
- m_encoder->Push();
12383
+ // using neg srcmod with mul will end up with more insts, thus using srcmod on mov
12384
+ if (negateSrc)
12385
+ {
12386
+ m_encoder->SetSrcModifier(0, EMOD_NEG);
12387
+ }
12388
+ m_encoder->Copy(pFinalAtomicSrcVal, pFinalAtomicSrcVal);
12389
+ m_encoder->Push();
12390
+ }
12352
12391
}
12353
12392
else
12354
12393
{
12355
- emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
12394
+ // general case
12395
+ pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12396
+ 1,
12397
+ type,
12398
+ isA64 ? EALIGN_2GRF : EALIGN_GRF,
12399
+ true,
12400
+ CName::NONE);
12401
+
12402
+ if (returnsImmValue)
12403
+ {
12404
+ // sum all the lanes
12405
+ emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
12406
+
12407
+ CVariable* pSrcCopy = pSrcsArr[0];
12408
+ if (m_currShader->m_numberInstance == 2)
12409
+ {
12410
+ pSrcCopy = pSrcsArr[1];
12411
+ }
12412
+
12413
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
12414
+ m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12415
+ m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12416
+ m_encoder->Push();
12417
+ }
12418
+ else
12419
+ {
12420
+ emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
12421
+ }
12356
12422
}
12357
12423
12358
12424
auto moveToReg = [&](CVariable*& pVar)
@@ -12388,11 +12454,6 @@ void EmitPass::emitScalarAtomics(
12388
12454
m_encoder->SetSimdSize(SIMDMode::SIMD1);
12389
12455
m_encoder->SetNoMask();
12390
12456
12391
- CVariable* pReturnVal = returnsImmValue ?
12392
- m_currShader->GetNewVariable(
12393
- 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12394
- nullptr;
12395
-
12396
12457
if (bitWidth == 16)
12397
12458
{
12398
12459
CVariable* pCastAtomicSrcVal =
@@ -12402,6 +12463,11 @@ void EmitPass::emitScalarAtomics(
12402
12463
pFinalAtomicSrcVal = pCastAtomicSrcVal;
12403
12464
}
12404
12465
12466
+ CVariable* pReturnVal = returnsImmValue ?
12467
+ m_currShader->GetNewVariable(
12468
+ 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12469
+ nullptr;
12470
+
12405
12471
if (shouldGenerateLSC(pInst))
12406
12472
{
12407
12473
m_encoder->LSC_AtomicRaw(
0 commit comments