Skip to content

Commit f6fd34c

Browse files
jgu222gfxbot
authored and committed
Enable fusing atomic messages. It is off by default. Turn it on by setting
igc key EnableAtomicFusion to 1. Change-Id: I9234b8bab65d05018a6dc2fc78718515c3408327
1 parent 1403698 commit f6fd34c

File tree

5 files changed

+204
-28
lines changed

5 files changed

+204
-28
lines changed

IGC/Compiler/CISACodeGen/CISABuilder.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3829,14 +3829,25 @@ void CEncoder::InitEncoder( bool canAbortOnSpill )
38293829
vbuilder->SetOption(vISA_FuseTypedWrites, true);
38303830
}
38313831

3832-
// Enable SendFusion for SIMD8
3832+
// Enable SendFusion for SIMD8
38333833
if (IGC_IS_FLAG_ENABLED(EnableSendFusion) &&
3834-
m_program->GetContext()->platform.supportSplitSend() &&
3834+
m_program->GetContext()->platform.supportSplitSend() &&
38353835
m_program->m_dispatchSize == SIMDMode::SIMD8 &&
38363836
(IGC_GET_FLAG_VALUE(EnableSendFusion) == FLAG_LEVEL_2 || // 2: force send fusion
38373837
context->m_DriverInfo.AllowSendFusion()))
38383838
{
38393839
vbuilder->SetOption(vISA_EnableSendFusion, true);
3840+
if (IGC_IS_FLAG_ENABLED(EnableAtomicFusion) &&
3841+
context->type == ShaderType::OPENCL_SHADER)
3842+
{
3843+
vbuilder->SetOption(vISA_EnableAtomicFusion, true);
3844+
}
3845+
}
3846+
3847+
if (context->getModuleMetaData()->compOpt.FastRelaxedMath ||
3848+
context->getModuleMetaData()->compOpt.UnsafeMathOptimizations)
3849+
{
3850+
vbuilder->SetOption(vISA_unsafeMath, true);
38403851
}
38413852

38423853
// With statelessToStatefull on, it is possible that two different BTI messages

IGC/common/igc_flags.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ DECLARE_IGC_REGKEY(bool, ExpandPlane, 0, "Enable pln to ma
5656
DECLARE_IGC_REGKEY(bool, EnableBCR, false, "Enable bank conflict reduction.")
5757
DECLARE_IGC_REGKEY(bool, GlobalSendVarSplit, false, "Enable global send variable splitting when we are about to spill")
5858
DECLARE_IGC_REGKEY(DWORD,EnableSendFusion, 1, "Enable(!=0)/disable(0)/force(2) send fusion. Valid for simd8 shader/kernel only.")
59+
DECLARE_IGC_REGKEY(bool, EnableAtomicFusion, false, "To enable/disable atomic send fusion (simd8 shaders). Valid if EnableSendFusion is on.")
5960

6061
DECLARE_IGC_GROUP("IGC Optimization")
6162
DECLARE_IGC_REGKEY(bool, DisableIGCOptimizations, false, "Setting this to 1/true adds a compiler switch to disables all the above IGC optimizations")

visa/Gen4_IR.hpp

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -409,28 +409,61 @@ class G4_SendMsgDescriptor
409409
funcID == SFID_SPAWNER || funcID == SFID_URB || funcID == SFID_NUM;
410410
}
411411

412-
bool isAtomicMessage() const
412+
bool isIntAtomicMessage() const
413413
{
414414
auto funcID = extDesc.layout.funcID;
415415
if (funcID != SFID_DP_DC1)
416416
return false;
417417

418-
uint32_t funcCtrl = getFuncCtrl();
419-
uint16_t msgType = (funcCtrl >> 14) & 0xF;
418+
uint16_t msgType = getMessageType();
420419
if (msgType == DC1_UNTYPED_ATOMIC || msgType == DC1_A64_ATOMIC)
421420
{
422421
return true;
423422
}
424423
if (getGenxPlatform() >= GENX_SKL)
424+
{
425+
if (msgType == DC1_TYPED_ATOMIC)
426+
return true;
427+
}
428+
return false;
429+
}
430+
431+
bool isFloatAtomicMessage() const
432+
{
433+
auto funcID = extDesc.layout.funcID;
434+
if (funcID != SFID_DP_DC1)
435+
return false;
436+
437+
uint16_t msgType = getMessageType();
438+
if (getGenxPlatform() >= GENX_SKL)
425439
{
426440
if (msgType == DC1_UNTYPED_FLOAT_ATOMIC ||
427-
msgType == DC1_A64_UNTYPED_FLOAT_ATOMIC ||
428-
msgType == DC1_TYPED_ATOMIC)
441+
msgType == DC1_A64_UNTYPED_FLOAT_ATOMIC)
429442
return true;
430443
}
431444
return false;
432445
}
433446

447+
bool isAtomicMessage() const
448+
{
449+
return isIntAtomicMessage() || isFloatAtomicMessage();
450+
}
451+
452+
uint16_t getAtomicOp() const
453+
{
454+
assert(isAtomicMessage() && "ICE: getting atomicOp from non-atomic message!");
455+
uint32_t funcCtrl = getFuncCtrl();
456+
if (isIntAtomicMessage())
457+
{
458+
// bits: 11:8
459+
return (uint16_t)((funcCtrl >> 8) & 0xF);
460+
}
461+
462+
// must be float Atomic
463+
// bits: 10:8
464+
return (int16_t)((funcCtrl >> 8) & 0x7);
465+
}
466+
434467
bool isBarrierMsg() const
435468
{
436469
auto funcID = extDesc.layout.funcID;

visa/SendFusion.cpp

Lines changed: 150 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ namespace vISA
130130
G4_VarBase* getVarBase(G4_VarBase* RegVar, G4_Type Ty);
131131
uint32_t getFuncCtrlWithSimd16(G4_SendMsgDescriptor* Desc);
132132
void simplifyMsg(INST_LIST_ITER SendIter);
133+
bool isAtomicCandidate(G4_SendMsgDescriptor* msgDesc);
133134

134135
bool WAce0Read;
135136

@@ -190,6 +191,11 @@ uint32_t SendFusion::getFuncCtrlWithSimd16(G4_SendMsgDescriptor* Desc)
190191
// bit13-12: SM3
191192
FC = ((FC & ~0x3000) | (MDC_SM3_SIMD16 << 12));
192193
break;
194+
case DC1_UNTYPED_ATOMIC:
195+
case DC1_UNTYPED_FLOAT_ATOMIC:
196+
// bit12: SM2R
197+
FC = ((FC & ~0x1000) | (MDC_SM2R_SIMD16 << 12));
198+
break;
193199
}
194200
}
195201
else if (funcID == SFID_DP_DC2)
@@ -220,9 +226,74 @@ uint32_t SendFusion::getFuncCtrlWithSimd16(G4_SendMsgDescriptor* Desc)
220226
return FC;
221227
}
222228

229+
// Decides whether the send described by msgDesc is an atomic message that
// send fusion is willing to consider.
//
// Accepted messages (all on SFID_DP_DC1):
//   - DC1_UNTYPED_ATOMIC whose op is one of AND, OR, XOR, INC, DEC, ADD,
//     SUB, REVSUB, IMAX, IMIN, UMAX, UMIN or PREDEC;
//   - DC1_UNTYPED_FLOAT_ATOMIC whose op is FMAX or FMIN, and only when the
//     vISA_unsafeMath option is set.
// Every other message yields false.
bool SendFusion::isAtomicCandidate(G4_SendMsgDescriptor* msgDesc)
{
    const uint32_t sfid = msgDesc->getFuncId();
    if (sfid != SFID_DP_DC1)
    {
        return false;
    }

    // Right now, the atomic messages accepted below are DW per simd-lane.
    const uint16_t messageType = msgDesc->getMessageType();
    const bool isIntegerAtomic = (messageType == DC1_UNTYPED_ATOMIC);
    if (!isIntegerAtomic && messageType != DC1_UNTYPED_FLOAT_ATOMIC)
    {
        return false;
    }

    // The message type is acceptable; now filter on the atomic operation.
    const uint16_t op = msgDesc->getAtomicOp();
    if (isIntegerAtomic)
    {
        switch (op)
        {
        case GEN_ATOMIC_AND:
        case GEN_ATOMIC_OR:
        case GEN_ATOMIC_XOR:
        case GEN_ATOMIC_INC:
        case GEN_ATOMIC_DEC:
        case GEN_ATOMIC_ADD:
        case GEN_ATOMIC_SUB:
        case GEN_ATOMIC_REVSUB:
        case GEN_ATOMIC_IMAX:
        case GEN_ATOMIC_IMIN:
        case GEN_ATOMIC_UMAX:
        case GEN_ATOMIC_UMIN:
        case GEN_ATOMIC_PREDEC:
            return true;
        default:
            return false;
        }
    }

    // Float atomics are only considered when unsafe math is enabled.
    if (!Builder->getOption(vISA_unsafeMath))
    {
        return false;
    }

    // TODO: Need to check if it is packed half integer/float ?
    switch (op)
    {
    case GEN_ATOMIC_FMAX:
    case GEN_ATOMIC_FMIN:
        return true;
    default:
        return false;
    }
}
293+
223294
// We will do send fusion for a few messages. Those messages all
224-
// have the address payload for each channel, thus address payload
225-
// is 1 GRF for exec_size=8 (no A64 messages for now).
295+
// have DW-sized address for each lane, thus address payload is
296+
// 1 GRF for exec_size=8 (no A64 messages for now).
226297
//
227298
// The optimization is performed for the following cases:
228299
// 1) [(w)] send(8) + send(8) --> (W&flag) send(16), and
@@ -286,10 +357,52 @@ bool SendFusion::simplifyAndCheckCandidate(INST_LIST_ITER Iter)
286357
return false;
287358
}
288359

289-
// Unless we can prove there are no aliases of two sends, we will not be
290-
// able to do fusion (or we know for sure that the first addr's value or
291-
// last addr's value is taken). For now, disable it.
292-
if (msgDesc->isDataPortWrite())
360+
// For write messages:
361+
// unless we can prove there are no aliases between the two sends' address payloads,
362+
// we will not be able to do fusion as hardware does not have deterministic
363+
// behavior if the same address appears more than once. For example,
364+
//
365+
// send (8) (a1_0, C, a1_2, ..., a1_7) (d1_0, d1_1, d1_2, ..., d1_7)
366+
// send (8) (a2_0, C, a2_2, ..., a2_7) (d2_0, d2_1, d2_2, ..., d2_7)
367+
//
368+
// Both sends have the same addr 'C' in lane 1, and C's value is d2_1 after
369+
// two sends. However, if they are fused, they become:
370+
//
371+
// send (16) (a1_0, C, ..., a2_0, C, ..., a2_7)
372+
// (d1_0, d1_1, ..., d2_0, d2_1, ..., d2_7)
373+
//
374+
// The hardware cannot guarantee that d2_1 will be the value written to addr 'C'.
375+
// Thus, we have to disallow fusion for writes
376+
//
377+
// But we can do it for atomic messages if certain conditions are met:
378+
// (Checked in isAtomicCandidate().)
379+
// 1. Can fuse if no return value.
380+
// Atomic messages have both read/write. Let us take a look at atomic_add
381+
// for example: let's assume atomic_add returns the old value and does updating.
382+
//
383+
// location at p : 10 (original value)
384+
// x = atomic_add p, 1
385+
// y = atomic_add p, 2
386+
// then, x = 10 & y = 11
387+
//
388+
// If they are fused, it becomes:
389+
// {x, y} = atomic_add {p, p}, {1, 2}
390+
// it's possible that we have x = 12 & y = 10 as the 2nd atomic operation
391+
// could be performed first. This does change the behavior of the program.
392+
//
393+
// Since we don't know if two sends share the same address, we will conservatively
394+
// avoid fusing two sends with return values for now.
395+
// 2. Assume unsafe math is present for float atomic.
396+
// As two sends might share the same location, and fused send might change the order
397+
// of float atomic operations. We need to have unsafe-math to perform fusing legally.
398+
//
399+
bool isAtomicCand = false;
400+
if (Builder->getOption(vISA_EnableAtomicFusion))
401+
{
402+
isAtomicCand = isAtomicCandidate(msgDesc);
403+
}
404+
if ((!isAtomicCand && msgDesc->isDataPortWrite()) ||
405+
(isAtomicCand && rspLen > 0))
293406
{
294407
return false;
295408
}
@@ -317,6 +430,14 @@ bool SendFusion::simplifyAndCheckCandidate(INST_LIST_ITER Iter)
317430
return false;
318431
}
319432

433+
// Only handling the following messages that have DW as data element.
434+
// Untyped with up to 4 DW per lane is handled.
435+
436+
// special handling of atomic
437+
if (isAtomicCand) {
438+
return true;
439+
}
440+
320441
uint32_t funcID = msgDesc->getFuncId();
321442
uint32_t msgType = msgDesc->getMessageType();
322443
if (funcID == SFID_DP_DC)
@@ -542,9 +663,12 @@ bool SendFusion::canFusion(INST_LIST_ITER IT0, INST_LIST_ITER IT1)
542663
// and hardware does not have deterministic behavior about which data,
543664
// data0 or data1, will be stored into x! For this reason, no write
544665
// will be fused if they have common address. Since we don't know (for now)
545-
// if addresses of two sends can point to the same address, we just
666+
// if addresses of two sends can point to the same location, we just
546667
// conservatively do not fuse any write messages.
547-
//
668+
//
669+
// Atomic messages:
670+
// As only no-return-value atomic can be fused, RAW will be false always.
671+
//
548672
G4_SendMsgDescriptor* desc0 = I0->getMsgDesc();
549673
G4_SendMsgDescriptor* desc1 = I1->getMsgDesc();
550674
bool fusion = I0->getOption() == I1->getOption() &&
@@ -1382,33 +1506,39 @@ bool SendFusion::run(G4_BB* BB)
13821506
++II0;
13831507
continue;
13841508
}
1385-
1509+
13861510
G4_INST* inst1 = nullptr;
13871511
INST_LIST_ITER II1 = II0;
13881512
++II1;
13891513
while(II1 != IE)
13901514
{
13911515
G4_INST* tmp = *II1;
1392-
if (tmp->opcode() == inst0->opcode() &&
1393-
tmp->getExecSize() == inst0->getExecSize() &&
1394-
simplifyAndCheckCandidate(II1))
1516+
if (simplifyAndCheckCandidate(II1))
13951517
{
1396-
if (canFusion(II0, II1))
1518+
// possible 2nd send to be fused
1519+
if (tmp->opcode() == inst0->opcode() &&
1520+
tmp->getExecSize() == inst0->getExecSize())
13971521
{
1398-
// Found
1399-
inst1 = tmp;
1522+
if (canFusion(II0, II1))
1523+
{
1524+
// Found
1525+
inst1 = tmp;
1526+
}
14001527
}
14011528

1402-
// Don't advance II1 as II1 might be the first send
1403-
// for the next pair of candidates.
1529+
// If found (inst1 != null), exit the inner loop to start fusing;
1530+
// if not found, exit the inner loop and use this one (II1) as
1531+
// the 1st send of possible next pair to start the outer loop again.
1532+
//
1533+
// In both cases, don't advance II1.
14041534
break;
14051535
}
14061536

14071537
++II1;
14081538
if (tmp->isSend() || tmp->isFence() || tmp->isOptBarrier())
14091539
{
14101540
// Don't try to fusion two sends that are separated
1411-
// by other memory/barrier instructions.
1541+
// by other memory/barrier instructions.
14121542
break;
14131543
}
14141544
}
@@ -1457,9 +1587,8 @@ bool SendFusion::run(G4_BB* BB)
14571587
//
14581588
// [(w)] send(8) + send(8) --> (W&flag) send(16)
14591589
//
1460-
// Either noMask or not. When no NoMask
1461-
//
1462-
// Note that (w) send(1|2|4) is also supported.
1590+
// Either noMask or not. When NoMask, send insts with
1591+
// execsize=1|2|4 are also supported.
14631592
//
14641593
bool vISA::doSendFusion(FlowGraph* aCFG, Mem_Manager* aMMgr)
14651594
{

visa/include/VISAOptions.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ DEF_VISA_OPTION(vISA_dumpRPE, ET_BOOL, "-dumpRPE", UNUSED,
3333
//=== Optimization options ===
3434
DEF_VISA_OPTION(vISA_EnableAlways, ET_BOOL, NULLSTR, UNUSED, true)
3535
DEF_VISA_OPTION(vISA_EnableSendFusion, ET_BOOL, "-enableSendFusion", UNUSED, false)
36+
DEF_VISA_OPTION(vISA_EnableAtomicFusion, ET_BOOL, "-enableAtomicFusion", UNUSED, false)
3637
DEF_VISA_OPTION(vISA_LocalCopyProp, ET_BOOL, "-nocopyprop", UNUSED, true)
3738
DEF_VISA_OPTION(vISA_LocalFlagOpt, ET_BOOL, "-noflagopt", UNUSED, true)
3839
DEF_VISA_OPTION(vISA_LocalMACopt, ET_BOOL, "-nomacopt", UNUSED, true)
@@ -62,6 +63,7 @@ DEF_VISA_OPTION(vISA_LVN, ET_BOOL, "-nolvn", UNUSED, tru
6263
// only affects acc substitution for now
6364
DEF_VISA_OPTION(vISA_numGeneralAcc, ET_INT32, "-numGeneralAcc", "USAGE: -numGeneralAcc <accNum>\n", 0)
6465
DEF_VISA_OPTION(vISA_reassociate, ET_BOOL, "-noreassoc", UNUSED, true)
66+
DEF_VISA_OPTION(vISA_unsafeMath, ET_BOOL, "-unsafeMath", UNUSED, false)
6567
DEF_VISA_OPTION(vISA_split4GRFVar, ET_BOOL, "-no4GRFSplit", UNUSED, true)
6668

6769
//=== code gen options ===

0 commit comments

Comments
 (0)