@@ -8225,7 +8225,7 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
- emitAtomicRaw(inst);
+ emitAtomicRaw(inst, inst->getOperand(1));
break;
case GenISAIntrinsic::GenISA_intatomictyped:
case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
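Note: only the address operand is forwarded at this call site, so the immediate offset and scale parameters must be defaulted. A minimal sketch of the assumed declaration, inferred from this call and from the null checks added later in this commit, not copied from the actual header:

    void emitAtomicRaw(llvm::GenIntrinsicInst *pInst,
                       llvm::Value *dstAddr = nullptr,        // falls back to operand 1
                       llvm::ConstantInt *immOffset = nullptr, // treated as offset 0
                       llvm::ConstantInt *immScale = nullptr); // treated as scale 1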
@@ -13686,18 +13686,13 @@ add happens with destination address as <addr> = constant. <src> = constant too.
say for SIMD8 there are 8 lanes trying to write to the same address. H/W will serialize this to
8 back to back atomic instructions which are extremely slow to execute.
*/
- void EmitPass::emitScalarAtomics(
- llvm::Instruction* pInst,
- ResourceDescriptor& resource,
- AtomicOp atomic_op,
- CVariable* pDstAddr,
- CVariable* pU,
- CVariable* pV,
- CVariable* pR,
- CVariable* pSrc,
- bool isA64,
- int bitWidth)
- {
+ void EmitPass::emitScalarAtomics(llvm::Instruction *pInst,
+ ResourceDescriptor &resource,
+ AtomicOp atomic_op,
+ CVariable *pDstAddr, CVariable *pU,
+ CVariable *pV, CVariable *pR, CVariable *pSrc,
+ bool isA64, int bitWidth, int immOffset,
+ int immScale, LSC_ADDR_SIZE addrSize) {
e_opcode op = EOPCODE_ADD;
// find the value for which opcode(x, identity) == x
unsigned int identityValue = 0;
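The comment above is the motivation for this whole path. As a standalone illustration (plain C++ analogy, not IGC code; names are made up): instead of issuing one atomic per lane, which the hardware serializes when all lanes hit the same address, reduce the per-lane sources once and issue a single atomic; the identityValue initialized above seeds that reduction.

    #include <atomic>
    #include <cstdint>

    constexpr int kSimdWidth = 8;

    // Naive lowering: kSimdWidth serialized atomics on one address.
    void perLaneAtomicAdd(std::atomic<uint32_t>& dst,
                          const uint32_t (&src)[kSimdWidth]) {
        for (int lane = 0; lane < kSimdWidth; ++lane)
            dst.fetch_add(src[lane]); // hardware serializes these
    }

    // Scalar-atomics lowering: one cheap reduction, then one atomic.
    void scalarAtomicAdd(std::atomic<uint32_t>& dst,
                         const uint32_t (&src)[kSimdWidth]) {
        uint32_t sum = 0; // identity value for add
        for (int lane = 0; lane < kSimdWidth; ++lane)
            sum += src[lane];
        dst.fetch_add(sum); // a single atomic replaces kSimdWidth of them
    }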
@@ -14022,7 +14017,10 @@ void EmitPass::emitScalarAtomicLoad(
CVariable* pR,
CVariable* pSrc,
bool isA64,
- int bitWidth)
+ int bitWidth,
+ int immOffset,
+ int immScale,
+ LSC_ADDR_SIZE addrSize)
{
auto moveToReg = [&](CVariable*& pVar)
{
@@ -14070,6 +14068,7 @@ void EmitPass::emitScalarAtomicLoad(
true,
pDstAddr ? pDstAddr->getName() : CName::NONE) : nullptr;
{
+
if (isA64)
{
m_encoder->AtomicRawA64(
@@ -14259,20 +14258,22 @@ CVariable* EmitPass::UnpackOrBroadcastIfUniform(CVariable* pVar)
return pUnpacked;
}
- void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
+ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst *pInst, Value *dstAddr,
+ ConstantInt *immOffset, ConstantInt *immScale
+ )
{
ForceDMask();
// Currently, Dword Atomics can be called by matching 2 intrinsics. One is the DwordAtomicRaw
// and AtomicCmpXchg (which has 2 srcs unlike the other atomics).
- IGC_ASSERT(IGCLLVM::getNumArgOperands(pInsn) == 4);
+ IGC_ASSERT(IGCLLVM::getNumArgOperands(pInst) == 4);
/// Immediate Atomics return the value before the atomic operation is performed. So that flag
/// needs to be set for this.
- bool returnsImmValue = !pInsn->use_empty();
+ bool returnsImmValue = !pInst->use_empty();
- llvm::Value* pllbuffer = pInsn->getOperand(0);
- llvm::Value* pllDstAddr = pInsn->getOperand(1);
- llvm::Value* pllSrc0 = pInsn->getOperand(2);
+ llvm::Value* pllbuffer = pInst->getOperand(0);
+ if (!dstAddr) dstAddr = pInst->getOperand(1);
+ llvm::Value* pllSrc0 = pInst->getOperand(2);
ResourceDescriptor resource = GetResourceVariable(pllbuffer);
CountStatelessIndirectAccess(pllbuffer, resource);
AtomicOp atomic_op = EATOMIC_UNDEF;
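As the comment above notes, these atomics return the value held before the operation, and returnsImmValue is set only when that result actually has uses. A plain C++ analogy of the same semantics (illustrative only, not IGC code):

    #include <atomic>
    #include <cassert>

    int main() {
        std::atomic<int> x{5};
        int old = x.fetch_add(3);          // returns the pre-operation value
        assert(old == 5 && x.load() == 8); // result used: returnsImmValue true
        x.fetch_add(1);                    // result unused: use_empty() holds,
                                           // so the flag stays false
        return 0;
    }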
@@ -14284,18 +14285,18 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
CVariable* pSrc0 = nullptr;
CVariable* pSrc1 = nullptr;
- llvm::GenIntrinsicInst* pIntrinCall = llvm::cast<llvm::GenIntrinsicInst>(pInsn);
+ llvm::GenIntrinsicInst* pIntrinCall = llvm::cast<llvm::GenIntrinsicInst>(pInst);
GenISAIntrinsic::ID IID = pIntrinCall->getIntrinsicID();
if (IID == GenISAIntrinsic::GenISA_icmpxchgatomicraw ||
IID == GenISAIntrinsic::GenISA_fcmpxchgatomicraw ||
IID == GenISAIntrinsic::GenISA_icmpxchgatomicrawA64 ||
IID == GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64)
{
- llvm::Value* pllSrc1 = pInsn->getOperand(3);
+ llvm::Value* pllSrc1 = pInst->getOperand(3);
pSrc1 = GetSymbol(pllSrc1);
- Function* F = pInsn->getParent()->getParent();
- if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
+ Function* F = pInst->getParent()->getParent();
+ if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInst))
{
m_encoder->SetSimdSize(SIMDMode::SIMD1);
m_encoder->SetNoMask();
@@ -14314,11 +14315,11 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
}
else
{
- atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand(3))->getZExtValue());
+ atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInst->getOperand(3))->getZExtValue());
}
- unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
+ unsigned short bitwidth = pInst->getType()->getScalarSizeInBits();
const bool is16Bit = (bitwidth == 16);
if (is16Bit)
@@ -14335,38 +14336,52 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
}
// Dst address in bytes.
- CVariable* pDstAddr = GetSymbol(pllDstAddr);
+ CVariable* pDstAddr = GetSymbol(dstAddr);
+
+
+ PointerType *PtrTy = dyn_cast<PointerType>(dstAddr->getType());
+ bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
+ LSC_ADDR_SIZE addrSize = isA64 ? LSC_ADDR_SIZE_64b : LSC_ADDR_SIZE_32b;
+
+ const int immOffsetVal =
+ immOffset ? static_cast<int>(immOffset->getSExtValue()) : 0;
+ const int immScaleVal =
+ immScale ? static_cast<int>(immScale->getSExtValue()) : 1;
+
// If DisableScalarAtomics regkey is enabled or DisableIGCOptimizations regkey is enabled then
// don't enable scalar atomics
- if (IsUniformAtomic(pInsn))
+ if (IsUniformAtomic(pInst))
{
- PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
- bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
e_alignment uniformAlign = isA64 ? EALIGN_2GRF : EALIGN_GRF;
// Re-align the pointer if it's not GRF aligned.
pDstAddr = ReAlignUniformVariable(pDstAddr, uniformAlign);
- if (atomic_op == EATOMIC_OR && OrWith0Atomic(pInsn, 2))
+ if (atomic_op == EATOMIC_OR && OrWith0Atomic(pInst, 2))
{
// special case of atomic_load
- emitScalarAtomicLoad(pInsn, resource, pDstAddr, nullptr /*u*/, nullptr /*v*/, nullptr /*r*/, pSrc0, isA64, bitwidth);
+ emitScalarAtomicLoad(pInst, resource,
+ pDstAddr, nullptr /*u*/, nullptr /*v*/,
+ nullptr /*r*/, pSrc0, isA64, bitwidth,
+ immOffsetVal, immScaleVal, addrSize);
}
- else
- {
- emitScalarAtomics(pInsn, resource, atomic_op, pDstAddr, nullptr /*u*/, nullptr /*v*/, nullptr /*r*/, pSrc0, isA64, bitwidth);
+ else {
+ emitScalarAtomics(pInst, resource, atomic_op,
+ pDstAddr, nullptr /*u*/, nullptr /*v*/,
+ nullptr /*r*/, pSrc0, isA64, bitwidth,
+ immOffsetVal, immScaleVal, addrSize);
ResetVMask();
}
return;
}
- Function* F = pInsn->getParent()->getParent();
- if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
+ Function* F = pInst->getParent()->getParent();
+ if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInst))
{
m_encoder->SetSimdSize(SIMDMode::SIMD1);
m_encoder->SetNoMask();
}
pDstAddr = BroadcastIfUniform(pDstAddr);
- if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
+ if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInst))
{
m_encoder->SetSimdSize(SIMDMode::SIMD1);
m_encoder->SetNoMask();
@@ -14376,7 +14391,7 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
pSrc0 = UnpackOrBroadcastIfUniform(pSrc0);
}
- if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
+ if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInst))
{
m_encoder->SetSimdSize(SIMDMode::SIMD1);
m_encoder->SetNoMask();
@@ -14390,9 +14405,6 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
EALIGN_GRF, CName::NONE) :
nullptr;
- PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
- bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
- LSC_ADDR_SIZE addrSize = isA64 ? LSC_ADDR_SIZE_64b : LSC_ADDR_SIZE_32b;
bool extendPointer = (bitwidth == 64 && !isA64);
// DG2 onward with LSC we do not have to extend an A32 pointer to an
// A64 pointer for 64bit atomics
@@ -14410,17 +14422,11 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
}
else
{
- if (shouldGenerateLSC())
- {
- m_encoder->LSC_AtomicRaw(
- atomic_op,
- pDst, pDstAddr,
- pSrc0, pSrc1,
- bitwidth,
- &resource,
- addrSize,
- 0,
- LSC_DEFAULT_CACHING);
+ if (shouldGenerateLSC()) {
+ m_encoder->LSC_AtomicRaw(atomic_op, pDst,
+ pDstAddr, pSrc0, pSrc1, bitwidth,
+ &resource, addrSize, immOffsetVal,
+ immScaleVal, LSC_DEFAULT_CACHING);
}
else
{
@@ -14466,19 +14472,15 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
uint label = 0;
CVariable* flag = nullptr;
bool needLoop = ResourceLoopHeader(resource, flag, label);
- if (shouldGenerateLSC(pInsn))
- {
- m_encoder->LSC_AtomicRaw(
- atomic_op,
- pDst, pDstAddr,
- pSrc0, pSrc1,
- bitwidth,
- &resource, addrSize,
- 0,
- LSC_DEFAULT_CACHING);
+ if (shouldGenerateLSC(pInst)) {
+ m_encoder->LSC_AtomicRaw(atomic_op, pDst,
+ pDstAddr, pSrc0, pSrc1, bitwidth,
+ &resource, addrSize, immOffsetVal,
+ immScaleVal, LSC_DEFAULT_CACHING);
}
else
{
+ IGC_ASSERT_MESSAGE(!immScale && !immOffset, "Scale and offset not supported on non-LSC path!");
m_encoder->DwordAtomicRaw(
atomic_op,
resource,
@@ -14561,11 +14563,17 @@ void EmitPass::emitAtomicTyped(GenIntrinsicInst* pInsn)
if (atomic_op == EATOMIC_OR && OrWith0Atomic(pInsn, 4))
{
// special case of atomic_load
- emitScalarAtomicLoad(pInsn, resource, nullptr /*pDstAddr*/, pU, pV, pR, pSrc0, false /*isA64*/, bitwidth);
+ emitScalarAtomicLoad(pInsn, resource,
+ nullptr /*pDstAddr*/, pU, pV, pR, pSrc0,
+ false /*isA64*/, bitwidth, 0, 1,
+ LSC_ADDR_SIZE_32b);
}
else
{
- emitScalarAtomics(pInsn, resource, atomic_op, nullptr /*pDstAddr*/, pU, pV, pR, pSrc0, false /*isA64*/, bitwidth);
+ emitScalarAtomics(pInsn, resource, atomic_op,
+ nullptr /*pDstAddr*/, pU, pV, pR, pSrc0,
+ false /*isA64*/, bitwidth, 0, 1,
+ LSC_ADDR_SIZE_32b);
}
}
else
@@ -21572,11 +21580,9 @@ void EmitPass::emitLSCAtomic(llvm::GenIntrinsicInst* inst)
auto cacheOpts = translateLSCCacheControlsFromValue(inst->getOperand(5), false);
- m_encoder->LSC_AtomicRaw(
- atomicOp, pOldValue, pDstAddr, pAtomicVal,
- pAtomicCmp, bitwidth, &resource,
- addrSize, immOff,
- cacheOpts);
+ m_encoder->LSC_AtomicRaw(atomicOp, pOldValue,
+ pDstAddr, pAtomicVal, pAtomicCmp, bitwidth,
+ &resource, addrSize, immOff, 1, cacheOpts);
m_encoder->Push();
}
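All three updated call sites are consistent with LSC_AtomicRaw gaining an immediate-scale parameter just before the cache controls; legacy callers such as this one pass 1, the same neutral value emitAtomicRaw uses when no scale constant is supplied. A sketch of the implied encoder interface, with parameter names and types inferred from the call sites rather than taken from the encoder header:

    void CEncoder::LSC_AtomicRaw(AtomicOp atomic_op, CVariable *pDst,
                                 CVariable *pDstAddr, CVariable *pSrc0,
                                 CVariable *pSrc1, unsigned short bitwidth,
                                 ResourceDescriptor *resource,
                                 LSC_ADDR_SIZE addrSize,
                                 int immOff, int immScale, // scale: new in this commit
                                 LSC_CACHE_OPTS cacheOpts);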