Skip to content

Commit 966f6c8

Browse files
Raoux, Thomas Fpaigeale
authored and committed
Revert of commit 3246f77.
Change-Id: I0ac07722731508d5145fd64f610f361ad5d89e21
1 parent 5774698 commit 966f6c8

File tree

11 files changed

+203
-374
lines changed

11 files changed

+203
-374
lines changed

IGC/Compiler/CISACodeGen/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,6 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
7171
"${CMAKE_CURRENT_SOURCE_DIR}/Simd32Profitability.cpp"
7272
"${CMAKE_CURRENT_SOURCE_DIR}/TypeDemote.cpp"
7373
"${CMAKE_CURRENT_SOURCE_DIR}/VariableReuseAnalysis.cpp"
74-
"${CMAKE_CURRENT_SOURCE_DIR}/UniformAtomic.cpp"
7574
"${CMAKE_CURRENT_SOURCE_DIR}/TranslationTable.cpp"
7675
"${CMAKE_CURRENT_SOURCE_DIR}/VectorPreProcess.cpp"
7776
"${CMAKE_CURRENT_SOURCE_DIR}/VectorProcess.cpp"
@@ -115,7 +114,6 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
115114
"${CMAKE_CURRENT_SOURCE_DIR}/FoldKnownWorkGroupSizes.h"
116115
"${CMAKE_CURRENT_SOURCE_DIR}/GenCodeGenModule.h"
117116
"${CMAKE_CURRENT_SOURCE_DIR}/GenIRLowering.h"
118-
"${CMAKE_CURRENT_SOURCE_DIR}/GenLLVMPasses.h"
119117
"${CMAKE_CURRENT_SOURCE_DIR}/GenNullPointerLowering.h"
120118
"${CMAKE_CURRENT_SOURCE_DIR}/GenSimplification.h"
121119
"${CMAKE_CURRENT_SOURCE_DIR}/GeometryShaderCodeGen.hpp"

IGC/Compiler/CISACodeGen/CShader.cpp

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1646,7 +1646,6 @@ static bool IsRawAtomicIntrinsic(llvm::Value *V) {
16461646
case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
16471647
case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
16481648
case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
1649-
case GenISAIntrinsic::GenISA_WaveUniformAtomic:
16501649
return true;
16511650
}
16521651

@@ -1685,7 +1684,7 @@ static e_alignment GetPreferredAlignmentOnUse(llvm::Value *V, WIAnalysis *WIA,
16851684
}
16861685

16871686
if (IsRawAtomicIntrinsic(GII)) {
1688-
Value *Ptr = V;
1687+
Value *Ptr = GII->getArgOperand(1);
16891688
if (WIA->whichDepend(Ptr) == WIAnalysis::UNIFORM) {
16901689
if (PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType())) {
16911690
if (IGC::isA64Ptr(PtrTy, pContext))
@@ -2147,8 +2146,6 @@ unsigned int CShader::EvaluateSIMDConstExpr(Value* C)
21472146
{
21482147
switch(op->getOpcode())
21492148
{
2150-
case Instruction::Sub:
2151-
return EvaluateSIMDConstExpr(op->getOperand(0)) - EvaluateSIMDConstExpr(op->getOperand(1));
21522149
case Instruction::Add:
21532150
return EvaluateSIMDConstExpr(op->getOperand(0)) + EvaluateSIMDConstExpr(op->getOperand(1));
21542151
case Instruction::Mul:

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 188 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -7179,9 +7179,6 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
71797179
case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
71807180
emitAtomicRaw(inst);
71817181
break;
7182-
case GenISAIntrinsic::GenISA_WaveUniformAtomic:
7183-
emitScalarAtomics(inst);
7184-
break;
71857182
case GenISAIntrinsic::GenISA_dwordatomicstructured:
71867183
case GenISAIntrinsic::GenISA_floatatomicstructured:
71877184
case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
@@ -9791,16 +9788,12 @@ void EmitPass::emitPreOrPostFixOp(e_opcode op, uint64_t identityValue, VISA_Type
97919788
CVariable* maskedSrc[2] = { 0 };
97929789
for(int i = 0; i < counter; ++i)
97939790
{
9791+
CVariable* pSrcCopy = m_currShader->GetNewVariable(
9792+
numLanes(m_currShader->m_SIMDSize),
9793+
type,
9794+
IGC::EALIGN_GRF,
9795+
false);
97949796

9795-
CVariable* pSrcCopy = pSrcsArr[i];
9796-
if(pSrcCopy == nullptr)
9797-
{
9798-
pSrcCopy = m_currShader->GetNewVariable(
9799-
numLanes(m_currShader->m_SIMDSize),
9800-
type,
9801-
IGC::EALIGN_GRF,
9802-
false);
9803-
}
98049797

98059798
// Set the GRF to 0 with no mask. This will set all the registers to 0
98069799
CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
@@ -10029,60 +10022,161 @@ void EmitPass::emitPreOrPostFixOp(e_opcode op, uint64_t identityValue, VISA_Type
1002910022
m_encoder->SetSecondHalf(false);
1003010023
}
1003110024

10032-
/// Emit single atomic for the whole HW thread
10033-
void EmitPass::emitScalarAtomics(Instruction* pInst)
10034-
{
10035-
CVariable* pDstAddr = GetSymbol(pInst->getOperand(0));
10036-
CVariable* offset = pDstAddr;
10037-
CVariable* src = GetSymbol(pInst->getOperand(2));
10038-
ResourceDescriptor resource = GetResourceVariable(pInst->getOperand(0));
10039-
AtomicOp atomic_op = static_cast<AtomicOp>(cast<ConstantInt>(pInst->getOperand(3))->getZExtValue());
10040-
if(!pDstAddr->IsUniform())
10025+
/*
10026+
ScalarAtomics: This optimization attempts to reduce the number of atomic instructions issued when
10027+
the destination addresses and the source are both uniform. For example lets say we have an atomic
10028+
add happens with destination address as <addr> = constant. <src> = constant too. In this case, lets
10029+
say for SIMD8 there are 8 lanes trying to write to the same address. H/W will serialize this to
10030+
8 back to back atomic instructions which are extremely slow to execute.
10031+
*/
10032+
void EmitPass::emitScalarAtomics(
10033+
llvm::Instruction* pInst,
10034+
const ResourceDescriptor& resource,
10035+
AtomicOp atomic_op,
10036+
CVariable* pDstAddr,
10037+
CVariable* pSrc,
10038+
bool isA64,
10039+
bool is16Bit)
10040+
{
10041+
e_opcode op = EOPCODE_ADD;
10042+
// find the value for which opcode(x, identity) == x
10043+
unsigned int identityValue = 0;
10044+
switch(atomic_op)
10045+
{
10046+
case EATOMIC_IADD:
10047+
case EATOMIC_SUB:
10048+
case EATOMIC_INC:
10049+
case EATOMIC_DEC:
10050+
identityValue = 0;
10051+
op = EOPCODE_ADD;
10052+
break;
10053+
case EATOMIC_UMAX:
10054+
identityValue = 0;
10055+
op = EOPCODE_MAX;
10056+
break;
10057+
case EATOMIC_IMAX:
10058+
identityValue = 0x80000000;
10059+
op = EOPCODE_MAX;
10060+
break;
10061+
case EATOMIC_UMIN:
10062+
identityValue = 0xFFFFFFFF;
10063+
op = EOPCODE_MIN;
10064+
break;
10065+
case EATOMIC_IMIN:
10066+
identityValue = 0X7FFFFFFF;
10067+
op = EOPCODE_MIN;
10068+
break;
10069+
default:
10070+
assert(0 && "unsupported scalar atomic type");
10071+
break;
10072+
}
10073+
10074+
VISA_Type type = is16Bit ? ISA_TYPE_W : ISA_TYPE_D;
10075+
if (atomic_op == EATOMIC_INC || atomic_op == EATOMIC_DEC)
1004110076
{
10042-
pDstAddr = UniformCopy(pDstAddr);
10077+
if (atomic_op == EATOMIC_INC)
10078+
{
10079+
atomic_op = EATOMIC_IADD;
10080+
}
10081+
else
10082+
{
10083+
atomic_op = EATOMIC_SUB;
10084+
}
10085+
10086+
pSrc = m_currShader->ImmToVariable(1, type);
1004310087
}
10044-
PointerType *PtrTy = dyn_cast<PointerType>(pInst->getOperand(0)->getType());
10045-
bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
10046-
const bool is16Bit = (pInst->getType()->getScalarSizeInBits() == 16);
10047-
// make sure the registers are aligned
10048-
src = ReAlignUniformVariable(src, EALIGN_GRF);
10049-
if(!isa<UndefValue>(pInst->getOperand(1)))
10088+
if(atomic_op == EATOMIC_UMAX || atomic_op == EATOMIC_UMIN)
1005010089
{
10051-
offset = GetSymbol(pInst->getOperand(1));
10052-
offset = ReAlignUniformVariable(offset, EALIGN_GRF);
10090+
type = GetUnsignedType(type);
10091+
}
10092+
AtomicOp uniformAtomicOp = atomic_op;
10093+
bool negateSrc = false;
10094+
if(atomic_op == EATOMIC_SUB)
10095+
{
10096+
negateSrc = true;
10097+
uniformAtomicOp = EATOMIC_IADD;
10098+
}
10099+
bool returnsImmValue = (!pInst->use_empty());
10100+
CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
10101+
1,
10102+
type,
10103+
isA64 ? IGC::EALIGN_2GRF : IGC::EALIGN_GRF,
10104+
true);
10105+
CVariable *pSrcsArr[2] = { nullptr, nullptr };
10106+
if(returnsImmValue)
10107+
{
10108+
// sum all the lanes
10109+
emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
10110+
10111+
CVariable *pSrcCopy = pSrcsArr[0];
10112+
if(m_currShader->m_dispatchSize == SIMDMode::SIMD32)
10113+
pSrcCopy = pSrcsArr[1];
10114+
10115+
m_encoder->SetSrcRegion(0, 0, 1, 0);
10116+
m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
10117+
m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
10118+
m_encoder->Push();
1005310119
}
1005410120
else
1005510121
{
10056-
pDstAddr = ReAlignUniformVariable(pDstAddr, isA64 ? EALIGN_2GRF : EALIGN_GRF);
10122+
emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
1005710123
}
10058-
CVariable* dst = m_destination;
10059-
if(!dst->IsGRFAligned())
10124+
10125+
if (pDstAddr->IsImmediate())
1006010126
{
10061-
dst = m_currShader->GetNewVariable(1, m_destination->GetType(), EALIGN_GRF, true);
10127+
CVariable* pDstAddrCopy = m_currShader->GetNewVariable(1, ISA_TYPE_UD, IGC::EALIGN_GRF, true);
10128+
m_encoder->SetSimdSize(SIMDMode::SIMD1);
10129+
m_encoder->SetNoMask();
10130+
m_encoder->Copy(pDstAddrCopy, pDstAddr);
10131+
m_encoder->Push();
10132+
pDstAddr = pDstAddrCopy;
1006210133
}
10063-
if(is16Bit)
10134+
10135+
m_encoder->SetSimdSize(SIMDMode::SIMD1);
10136+
m_encoder->SetNoMask();
10137+
10138+
CVariable *pReturnVal = returnsImmValue ?
10139+
m_currShader->GetNewVariable(1, ISA_TYPE_UD, IGC::EALIGN_GRF, true) :
10140+
nullptr;
10141+
10142+
if (is16Bit)
1006410143
{
1006510144
CVariable *pCastAtomicSrcVal =
1006610145
m_currShader->GetNewVariable(1, ISA_TYPE_UD, IGC::EALIGN_GRF, true);
10067-
m_encoder->Cast(pCastAtomicSrcVal, src);
10068-
src = pCastAtomicSrcVal;
10146+
10147+
m_encoder->Cast(pCastAtomicSrcVal, pFinalAtomicSrcVal);
10148+
pFinalAtomicSrcVal = pCastAtomicSrcVal;
1006910149
}
10070-
m_encoder->SetSimdSize(SIMDMode::SIMD1);
10071-
m_encoder->SetNoMask();
10150+
1007210151
if(isA64)
1007310152
{
10074-
m_encoder->AtomicRawA64(atomic_op, dst, pDstAddr, src, nullptr, is16Bit ? 16 : 32);
10153+
m_encoder->AtomicRawA64(uniformAtomicOp, pReturnVal, pDstAddr, pFinalAtomicSrcVal, nullptr, is16Bit ? 16 : 32);
1007510154
}
1007610155
else
1007710156
{
10078-
10079-
m_encoder->DwordAtomicRaw(atomic_op, resource, dst, offset, src, nullptr, is16Bit);
10157+
m_encoder->DwordAtomicRaw(uniformAtomicOp, resource, pReturnVal, pDstAddr, pFinalAtomicSrcVal, nullptr, is16Bit);
1008010158
}
1008110159
m_encoder->Push();
10082-
if(dst != m_destination)
10160+
10161+
if (returnsImmValue)
1008310162
{
10084-
m_encoder->Copy(m_destination, dst);
10085-
m_encoder->Push();
10163+
unsigned int counter = m_currShader->m_dispatchSize == SIMDMode::SIMD32 ? 2 : 1;
10164+
assert(op == EOPCODE_ADD && "we can only get the return value for add right now");
10165+
for (unsigned int i = 0; i < counter; ++i)
10166+
{
10167+
m_encoder->SetNoMask();
10168+
m_encoder->Add(pSrcsArr[i], pSrcsArr[i], pReturnVal);
10169+
m_encoder->Push();
10170+
10171+
if (atomic_op == EATOMIC_IADD)
10172+
{
10173+
m_encoder->SetSrcModifier(1, EMOD_NEG);
10174+
}
10175+
10176+
m_encoder->SetSecondHalf(i == 1);
10177+
m_encoder->Add(m_destination, pSrcsArr[i], pSrc);
10178+
m_encoder->Push();
10179+
}
1008610180
}
1008710181
}
1008810182

@@ -10091,7 +10185,34 @@ bool EmitPass::IsUniformAtomic(llvm::Instruction* pInst)
1009110185
if (llvm::GenIntrinsicInst* pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(pInst))
1009210186
{
1009310187
GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID();
10188+
10189+
// Dst address in bytes.
10190+
if (id == GenISAIntrinsic::GenISA_intatomicraw ||
10191+
id == GenISAIntrinsic::GenISA_intatomicrawA64)
10192+
{
10193+
if(IGC_IS_FLAG_ENABLED(DisableScalarAtomics) || m_currShader->m_DriverInfo->WASLMPointersDwordUnit())
10194+
return false;
10195+
llvm::Value* pllDstAddr = pInst->getOperand(1);
10196+
CVariable* pDstAddr = GetSymbol(pllDstAddr);
10197+
if (pDstAddr->IsUniform())
10198+
{
10199+
AtomicOp atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInst->getOperand(3))->getZExtValue());
10200+
10201+
bool isAddAtomic = atomic_op == EATOMIC_IADD ||
10202+
atomic_op == EATOMIC_INC ||
10203+
atomic_op == EATOMIC_SUB;
10204+
bool isMinMaxAtomic =
10205+
atomic_op == EATOMIC_UMAX ||
10206+
atomic_op == EATOMIC_UMIN ||
10207+
atomic_op == EATOMIC_IMIN ||
10208+
atomic_op == EATOMIC_IMAX;
10209+
10210+
if (isAddAtomic || (isMinMaxAtomic && pInst->use_empty()))
10211+
return true;
10212+
}
10213+
}
1009410214
}
10215+
1009510216
return false;
1009610217
}
1009710218

@@ -10156,7 +10277,10 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1015610277
atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand(3))->getZExtValue());
1015710278
}
1015810279

10280+
unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
1015910281
const bool is16Bit = (pInsn->getType()->getScalarSizeInBits() == 16);
10282+
10283+
1016010284
// atomic_inc and atomic_dec don't have both src0 and src1.
1016110285
if(atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC)
1016210286
{
@@ -10165,6 +10289,20 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1016510289

1016610290
// Dst address in bytes.
1016710291
CVariable* pDstAddr = GetSymbol(pllDstAddr);
10292+
// If DisableScalarAtomics regkey is enabled or DisableIGCOptimizations regkey is enabled then
10293+
// don't enable scalar atomics, also do not enable for 64 bit
10294+
if (IsUniformAtomic(pInsn) && bitwidth != 64)
10295+
{
10296+
PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
10297+
bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
10298+
e_alignment uniformAlign = isA64 ? EALIGN_2GRF : EALIGN_GRF;
10299+
// Re-align the pointer if it's not GRF aligned.
10300+
pDstAddr = ReAlignUniformVariable(pDstAddr, uniformAlign);
10301+
emitScalarAtomics(pInsn, resource, atomic_op, pDstAddr, pSrc0, isA64, is16Bit);
10302+
ResetVMask();
10303+
return;
10304+
}
10305+
1016810306
pDstAddr = BroadcastIfUniform(pDstAddr);
1016910307
if (pSrc0)
1017010308
{
@@ -10181,7 +10319,6 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1018110319
m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize), m_destination->GetType(), EALIGN_GRF) :
1018210320
nullptr;
1018310321

10184-
unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
1018510322
PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
1018610323
bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
1018710324
bool extendPointer = (bitwidth == 64 && !isA64);
@@ -13935,12 +14072,13 @@ void EmitPass::emitWavePrefix(llvm::GenIntrinsicInst* inst, bool isQuad)
1393514072
GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
1393614073
CVariable* src = GetSymbol(inst->getOperand(0));
1393714074
CVariable *dst[2] = { nullptr, nullptr };
13938-
dst[0] = m_currShader->GetVarHalf(m_destination, 0);
13939-
if(m_currShader->m_dispatchSize == SIMDMode::SIMD32)
14075+
emitPreOrPostFixOp(opCode, identity, type, false, src, dst, !isInclusiveScan, isQuad);
14076+
m_encoder->Copy(m_destination, dst[0]);
14077+
if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
1394014078
{
13941-
dst[1] = m_currShader->GetVarHalf(m_destination, 1);
14079+
m_encoder->SetSecondHalf(true);
14080+
m_encoder->Copy(m_destination, dst[1]);
1394214081
}
13943-
emitPreOrPostFixOp(opCode, identity, type, false, src, dst, !isInclusiveScan, isQuad);
1394414082
m_encoder->Push();
1394514083
}
1394614084

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,14 @@ class EmitPass : public llvm::FunctionPass
229229

230230
void emitUAVSerialize();
231231

232-
void emitScalarAtomics(llvm::Instruction* pInst);
232+
void emitScalarAtomics(
233+
llvm::Instruction* pInst,
234+
const ResourceDescriptor& resource,
235+
AtomicOp atomic_op,
236+
CVariable* pDstAddr,
237+
CVariable* pSrc,
238+
bool isA64,
239+
bool is16Bit);
233240
/// do reduction and accummulate all the activate channels, return a uniform
234241
void emitReductionAll(
235242
e_opcode op,

0 commit comments

Comments
 (0)