@@ -7179,9 +7179,6 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
7179
7179
case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
7180
7180
emitAtomicRaw (inst);
7181
7181
break ;
7182
- case GenISAIntrinsic::GenISA_WaveUniformAtomic:
7183
- emitScalarAtomics (inst);
7184
- break ;
7185
7182
case GenISAIntrinsic::GenISA_dwordatomicstructured:
7186
7183
case GenISAIntrinsic::GenISA_floatatomicstructured:
7187
7184
case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
@@ -9791,16 +9788,12 @@ void EmitPass::emitPreOrPostFixOp(e_opcode op, uint64_t identityValue, VISA_Type
9791
9788
CVariable* maskedSrc[2 ] = { 0 };
9792
9789
for (int i = 0 ; i < counter; ++i)
9793
9790
{
9791
+ CVariable* pSrcCopy = m_currShader->GetNewVariable (
9792
+ numLanes (m_currShader->m_SIMDSize ),
9793
+ type,
9794
+ IGC::EALIGN_GRF,
9795
+ false );
9794
9796
9795
- CVariable* pSrcCopy = pSrcsArr[i];
9796
- if (pSrcCopy == nullptr )
9797
- {
9798
- pSrcCopy = m_currShader->GetNewVariable (
9799
- numLanes (m_currShader->m_SIMDSize ),
9800
- type,
9801
- IGC::EALIGN_GRF,
9802
- false );
9803
- }
9804
9797
9805
9798
// Set the GRF to 0 with no mask. This will set all the registers to 0
9806
9799
CVariable* pIdentityValue = m_currShader->ImmToVariable (identityValue, type);
@@ -10029,60 +10022,161 @@ void EmitPass::emitPreOrPostFixOp(e_opcode op, uint64_t identityValue, VISA_Type
10029
10022
m_encoder->SetSecondHalf (false );
10030
10023
}
10031
10024
10032
- // / Emit single atomic for the whole HW thread
10033
- void EmitPass::emitScalarAtomics (Instruction* pInst)
10034
- {
10035
- CVariable* pDstAddr = GetSymbol (pInst->getOperand (0 ));
10036
- CVariable* offset = pDstAddr;
10037
- CVariable* src = GetSymbol (pInst->getOperand (2 ));
10038
- ResourceDescriptor resource = GetResourceVariable (pInst->getOperand (0 ));
10039
- AtomicOp atomic_op = static_cast <AtomicOp>(cast<ConstantInt>(pInst->getOperand (3 ))->getZExtValue ());
10040
- if (!pDstAddr->IsUniform ())
10025
+ /*
10026
+ ScalarAtomics: This optimization attempts to reduce the number of atomic instructions issued when
10027
+ the destination addresses and the source are both uniform. For example, let's say we have an atomic
10028
+ add whose destination address is <addr> = constant, and <src> = constant too. In this case, let's
10029
+ say for SIMD8 there are 8 lanes trying to write to the same address. H/W will serialize this to
10030
+ 8 back-to-back atomic instructions, which are extremely slow to execute.
10031
+ */
10032
+ void EmitPass::emitScalarAtomics (
10033
+ llvm::Instruction* pInst,
10034
+ const ResourceDescriptor& resource,
10035
+ AtomicOp atomic_op,
10036
+ CVariable* pDstAddr,
10037
+ CVariable* pSrc,
10038
+ bool isA64,
10039
+ bool is16Bit)
10040
+ {
10041
+ e_opcode op = EOPCODE_ADD;
10042
+ // find the value for which opcode(x, identity) == x
10043
+ unsigned int identityValue = 0 ;
10044
+ switch (atomic_op)
10045
+ {
10046
+ case EATOMIC_IADD:
10047
+ case EATOMIC_SUB:
10048
+ case EATOMIC_INC:
10049
+ case EATOMIC_DEC:
10050
+ identityValue = 0 ;
10051
+ op = EOPCODE_ADD;
10052
+ break ;
10053
+ case EATOMIC_UMAX:
10054
+ identityValue = 0 ;
10055
+ op = EOPCODE_MAX;
10056
+ break ;
10057
+ case EATOMIC_IMAX:
10058
+ identityValue = 0x80000000 ;
10059
+ op = EOPCODE_MAX;
10060
+ break ;
10061
+ case EATOMIC_UMIN:
10062
+ identityValue = 0xFFFFFFFF ;
10063
+ op = EOPCODE_MIN;
10064
+ break ;
10065
+ case EATOMIC_IMIN:
10066
+ identityValue = 0X7FFFFFFF ;
10067
+ op = EOPCODE_MIN;
10068
+ break ;
10069
+ default :
10070
+ assert (0 && " unsupported scalar atomic type" );
10071
+ break ;
10072
+ }
10073
+
10074
+ VISA_Type type = is16Bit ? ISA_TYPE_W : ISA_TYPE_D;
10075
+ if (atomic_op == EATOMIC_INC || atomic_op == EATOMIC_DEC)
10041
10076
{
10042
- pDstAddr = UniformCopy (pDstAddr);
10077
+ if (atomic_op == EATOMIC_INC)
10078
+ {
10079
+ atomic_op = EATOMIC_IADD;
10080
+ }
10081
+ else
10082
+ {
10083
+ atomic_op = EATOMIC_SUB;
10084
+ }
10085
+
10086
+ pSrc = m_currShader->ImmToVariable (1 , type);
10043
10087
}
10044
- PointerType *PtrTy = dyn_cast<PointerType>(pInst->getOperand (0 )->getType ());
10045
- bool isA64 = PtrTy && isA64Ptr (PtrTy, m_currShader->GetContext ());
10046
- const bool is16Bit = (pInst->getType ()->getScalarSizeInBits () == 16 );
10047
- // make sure the registers are aligned
10048
- src = ReAlignUniformVariable (src, EALIGN_GRF);
10049
- if (!isa<UndefValue>(pInst->getOperand (1 )))
10088
+ if (atomic_op == EATOMIC_UMAX || atomic_op == EATOMIC_UMIN)
10050
10089
{
10051
- offset = GetSymbol (pInst->getOperand (1 ));
10052
- offset = ReAlignUniformVariable (offset, EALIGN_GRF);
10090
+ type = GetUnsignedType (type);
10091
+ }
10092
+ AtomicOp uniformAtomicOp = atomic_op;
10093
+ bool negateSrc = false ;
10094
+ if (atomic_op == EATOMIC_SUB)
10095
+ {
10096
+ negateSrc = true ;
10097
+ uniformAtomicOp = EATOMIC_IADD;
10098
+ }
10099
+ bool returnsImmValue = (!pInst->use_empty ());
10100
+ CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable (
10101
+ 1 ,
10102
+ type,
10103
+ isA64 ? IGC::EALIGN_2GRF : IGC::EALIGN_GRF,
10104
+ true );
10105
+ CVariable *pSrcsArr[2 ] = { nullptr , nullptr };
10106
+ if (returnsImmValue)
10107
+ {
10108
+ // sum all the lanes
10109
+ emitPreOrPostFixOp (op, identityValue, type, negateSrc, pSrc, pSrcsArr);
10110
+
10111
+ CVariable *pSrcCopy = pSrcsArr[0 ];
10112
+ if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
10113
+ pSrcCopy = pSrcsArr[1 ];
10114
+
10115
+ m_encoder->SetSrcRegion (0 , 0 , 1 , 0 );
10116
+ m_encoder->SetSrcSubReg (0 , numLanes (m_currShader->m_SIMDSize ) - 1 );
10117
+ m_encoder->Copy (pFinalAtomicSrcVal, pSrcCopy);
10118
+ m_encoder->Push ();
10053
10119
}
10054
10120
else
10055
10121
{
10056
- pDstAddr = ReAlignUniformVariable (pDstAddr, isA64 ? EALIGN_2GRF : EALIGN_GRF );
10122
+ emitReductionAll (op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal );
10057
10123
}
10058
- CVariable* dst = m_destination;
10059
- if (!dst-> IsGRFAligned ())
10124
+
10125
+ if (pDstAddr-> IsImmediate ())
10060
10126
{
10061
- dst = m_currShader->GetNewVariable (1 , m_destination->GetType (), EALIGN_GRF, true );
10127
+ CVariable* pDstAddrCopy = m_currShader->GetNewVariable (1 , ISA_TYPE_UD, IGC::EALIGN_GRF, true );
10128
+ m_encoder->SetSimdSize (SIMDMode::SIMD1);
10129
+ m_encoder->SetNoMask ();
10130
+ m_encoder->Copy (pDstAddrCopy, pDstAddr);
10131
+ m_encoder->Push ();
10132
+ pDstAddr = pDstAddrCopy;
10062
10133
}
10063
- if (is16Bit)
10134
+
10135
+ m_encoder->SetSimdSize (SIMDMode::SIMD1);
10136
+ m_encoder->SetNoMask ();
10137
+
10138
+ CVariable *pReturnVal = returnsImmValue ?
10139
+ m_currShader->GetNewVariable (1 , ISA_TYPE_UD, IGC::EALIGN_GRF, true ) :
10140
+ nullptr ;
10141
+
10142
+ if (is16Bit)
10064
10143
{
10065
10144
CVariable *pCastAtomicSrcVal =
10066
10145
m_currShader->GetNewVariable (1 , ISA_TYPE_UD, IGC::EALIGN_GRF, true );
10067
- m_encoder->Cast (pCastAtomicSrcVal, src);
10068
- src = pCastAtomicSrcVal;
10146
+
10147
+ m_encoder->Cast (pCastAtomicSrcVal, pFinalAtomicSrcVal);
10148
+ pFinalAtomicSrcVal = pCastAtomicSrcVal;
10069
10149
}
10070
- m_encoder->SetSimdSize (SIMDMode::SIMD1);
10071
- m_encoder->SetNoMask ();
10150
+
10072
10151
if (isA64)
10073
10152
{
10074
- m_encoder->AtomicRawA64 (atomic_op, dst , pDstAddr, src , nullptr , is16Bit ? 16 : 32 );
10153
+ m_encoder->AtomicRawA64 (uniformAtomicOp, pReturnVal , pDstAddr, pFinalAtomicSrcVal , nullptr , is16Bit ? 16 : 32 );
10075
10154
}
10076
10155
else
10077
10156
{
10078
-
10079
- m_encoder->DwordAtomicRaw (atomic_op, resource, dst, offset, src, nullptr , is16Bit);
10157
+ m_encoder->DwordAtomicRaw (uniformAtomicOp, resource, pReturnVal, pDstAddr, pFinalAtomicSrcVal, nullptr , is16Bit);
10080
10158
}
10081
10159
m_encoder->Push ();
10082
- if (dst != m_destination)
10160
+
10161
+ if (returnsImmValue)
10083
10162
{
10084
- m_encoder->Copy (m_destination, dst);
10085
- m_encoder->Push ();
10163
+ unsigned int counter = m_currShader->m_dispatchSize == SIMDMode::SIMD32 ? 2 : 1 ;
10164
+ assert (op == EOPCODE_ADD && " we can only get the return value for add right now" );
10165
+ for (unsigned int i = 0 ; i < counter; ++i)
10166
+ {
10167
+ m_encoder->SetNoMask ();
10168
+ m_encoder->Add (pSrcsArr[i], pSrcsArr[i], pReturnVal);
10169
+ m_encoder->Push ();
10170
+
10171
+ if (atomic_op == EATOMIC_IADD)
10172
+ {
10173
+ m_encoder->SetSrcModifier (1 , EMOD_NEG);
10174
+ }
10175
+
10176
+ m_encoder->SetSecondHalf (i == 1 );
10177
+ m_encoder->Add (m_destination, pSrcsArr[i], pSrc);
10178
+ m_encoder->Push ();
10179
+ }
10086
10180
}
10087
10181
}
10088
10182
@@ -10091,7 +10185,34 @@ bool EmitPass::IsUniformAtomic(llvm::Instruction* pInst)
10091
10185
if (llvm::GenIntrinsicInst* pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(pInst))
10092
10186
{
10093
10187
GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID ();
10188
+
10189
+ // Dst address in bytes.
10190
+ if (id == GenISAIntrinsic::GenISA_intatomicraw ||
10191
+ id == GenISAIntrinsic::GenISA_intatomicrawA64)
10192
+ {
10193
+ if (IGC_IS_FLAG_ENABLED (DisableScalarAtomics) || m_currShader->m_DriverInfo ->WASLMPointersDwordUnit ())
10194
+ return false ;
10195
+ llvm::Value* pllDstAddr = pInst->getOperand (1 );
10196
+ CVariable* pDstAddr = GetSymbol (pllDstAddr);
10197
+ if (pDstAddr->IsUniform ())
10198
+ {
10199
+ AtomicOp atomic_op = static_cast <AtomicOp>(llvm::cast<llvm::ConstantInt>(pInst->getOperand (3 ))->getZExtValue ());
10200
+
10201
+ bool isAddAtomic = atomic_op == EATOMIC_IADD ||
10202
+ atomic_op == EATOMIC_INC ||
10203
+ atomic_op == EATOMIC_SUB;
10204
+ bool isMinMaxAtomic =
10205
+ atomic_op == EATOMIC_UMAX ||
10206
+ atomic_op == EATOMIC_UMIN ||
10207
+ atomic_op == EATOMIC_IMIN ||
10208
+ atomic_op == EATOMIC_IMAX;
10209
+
10210
+ if (isAddAtomic || (isMinMaxAtomic && pInst->use_empty ()))
10211
+ return true ;
10212
+ }
10213
+ }
10094
10214
}
10215
+
10095
10216
return false ;
10096
10217
}
10097
10218
@@ -10156,7 +10277,10 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
10156
10277
atomic_op = static_cast <AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand (3 ))->getZExtValue ());
10157
10278
}
10158
10279
10280
+ unsigned short bitwidth = pInsn->getType ()->getScalarSizeInBits ();
10159
10281
const bool is16Bit = (pInsn->getType ()->getScalarSizeInBits () == 16 );
10282
+
10283
+
10160
10284
// atomic_inc and atomic_dec don't have both src0 and src1.
10161
10285
if (atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC)
10162
10286
{
@@ -10165,6 +10289,20 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
10165
10289
10166
10290
// Dst address in bytes.
10167
10291
CVariable* pDstAddr = GetSymbol (pllDstAddr);
10292
+ // If DisableScalarAtomics regkey is enabled or DisableIGCOptimizations regkey is enabled then
10293
+ // don't enable scalar atomics. Also do not enable them for 64-bit atomics.
10294
+ if (IsUniformAtomic (pInsn) && bitwidth != 64 )
10295
+ {
10296
+ PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType ());
10297
+ bool isA64 = PtrTy && isA64Ptr (PtrTy, m_currShader->GetContext ());
10298
+ e_alignment uniformAlign = isA64 ? EALIGN_2GRF : EALIGN_GRF;
10299
+ // Re-align the pointer if it's not GRF aligned.
10300
+ pDstAddr = ReAlignUniformVariable (pDstAddr, uniformAlign);
10301
+ emitScalarAtomics (pInsn, resource, atomic_op, pDstAddr, pSrc0, isA64, is16Bit);
10302
+ ResetVMask ();
10303
+ return ;
10304
+ }
10305
+
10168
10306
pDstAddr = BroadcastIfUniform (pDstAddr);
10169
10307
if (pSrc0)
10170
10308
{
@@ -10181,7 +10319,6 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
10181
10319
m_currShader->GetNewVariable (numLanes (m_currShader->m_SIMDSize ), m_destination->GetType (), EALIGN_GRF) :
10182
10320
nullptr ;
10183
10321
10184
- unsigned short bitwidth = pInsn->getType ()->getScalarSizeInBits ();
10185
10322
PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType ());
10186
10323
bool isA64 = PtrTy && isA64Ptr (PtrTy, m_currShader->GetContext ());
10187
10324
bool extendPointer = (bitwidth == 64 && !isA64);
@@ -13935,12 +14072,13 @@ void EmitPass::emitWavePrefix(llvm::GenIntrinsicInst* inst, bool isQuad)
13935
14072
GetReductionOp (op, inst->getOperand (0 )->getType (), identity, opCode, type);
13936
14073
CVariable* src = GetSymbol (inst->getOperand (0 ));
13937
14074
CVariable *dst[2 ] = { nullptr , nullptr };
13938
- dst[0 ] = m_currShader->GetVarHalf (m_destination, 0 );
13939
- if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
14075
+ emitPreOrPostFixOp (opCode, identity, type, false , src, dst, !isInclusiveScan, isQuad);
14076
+ m_encoder->Copy (m_destination, dst[0 ]);
14077
+ if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
13940
14078
{
13941
- dst[1 ] = m_currShader->GetVarHalf (m_destination, 1 );
14079
+ m_encoder->SetSecondHalf (true );
14080
+ m_encoder->Copy (m_destination, dst[1 ]);
13942
14081
}
13943
- emitPreOrPostFixOp (opCode, identity, type, false , src, dst, !isInclusiveScan, isQuad);
13944
14082
m_encoder->Push ();
13945
14083
}
13946
14084
0 commit comments