@@ -130,6 +130,7 @@ namespace vISA
130
130
G4_VarBase* getVarBase (G4_VarBase* RegVar, G4_Type Ty);
131
131
uint32_t getFuncCtrlWithSimd16 (G4_SendMsgDescriptor* Desc);
132
132
void simplifyMsg (INST_LIST_ITER SendIter);
133
+ bool isAtomicCandidate (G4_SendMsgDescriptor* msgDesc);
133
134
134
135
bool WAce0Read;
135
136
@@ -190,6 +191,11 @@ uint32_t SendFusion::getFuncCtrlWithSimd16(G4_SendMsgDescriptor* Desc)
190
191
// bit13-12: SM3
191
192
FC = ((FC & ~0x3000 ) | (MDC_SM3_SIMD16 << 12 ));
192
193
break ;
194
+ case DC1_UNTYPED_ATOMIC:
195
+ case DC1_UNTYPED_FLOAT_ATOMIC:
196
+ // bit12: SM2R
197
+ FC = ((FC & ~0x1000 ) | (MDC_SM2R_SIMD16 << 12 ));
198
+ break ;
193
199
}
194
200
}
195
201
else if (funcID == SFID_DP_DC2)
@@ -220,9 +226,74 @@ uint32_t SendFusion::getFuncCtrlWithSimd16(G4_SendMsgDescriptor* Desc)
220
226
return FC;
221
227
}
222
228
229
+ bool SendFusion::isAtomicCandidate (G4_SendMsgDescriptor* msgDesc)
230
+ {
231
+ uint32_t funcID = msgDesc->getFuncId ();
232
+ if (funcID != SFID_DP_DC1) {
233
+ return false ;
234
+ }
235
+
236
+ // Right now, the following atomic messages are DW per simd-lane.
237
+ uint16_t msgType = msgDesc->getMessageType ();
238
+ bool intAtomic = true ; // true: int; false : float
239
+ switch (msgType) {
240
+ default :
241
+ return false ;
242
+ case DC1_UNTYPED_ATOMIC:
243
+ break ;
244
+ case DC1_UNTYPED_FLOAT_ATOMIC:
245
+ intAtomic = false ;
246
+ break ;
247
+ }
248
+
249
+ // Had right atomic type, now check AtomicOp
250
+ uint16_t atomicOp = msgDesc->getAtomicOp ();
251
+ if (intAtomic)
252
+ {
253
+ switch (atomicOp)
254
+ {
255
+ default :
256
+ return false ;
257
+ case GEN_ATOMIC_AND:
258
+ case GEN_ATOMIC_OR:
259
+ case GEN_ATOMIC_XOR:
260
+ case GEN_ATOMIC_INC:
261
+ case GEN_ATOMIC_DEC:
262
+ case GEN_ATOMIC_ADD:
263
+ case GEN_ATOMIC_SUB:
264
+ case GEN_ATOMIC_REVSUB:
265
+ case GEN_ATOMIC_IMAX:
266
+ case GEN_ATOMIC_IMIN:
267
+ case GEN_ATOMIC_UMAX:
268
+ case GEN_ATOMIC_UMIN:
269
+ case GEN_ATOMIC_PREDEC:
270
+ break ;
271
+ }
272
+ }
273
+ else
274
+ {
275
+ if (!Builder->getOption (vISA_unsafeMath))
276
+ {
277
+ return false ;
278
+ }
279
+
280
+ switch (atomicOp)
281
+ {
282
+ default :
283
+ return false ;
284
+ case GEN_ATOMIC_FMAX:
285
+ case GEN_ATOMIC_FMIN:
286
+ break ;
287
+ }
288
+ }
289
+ return true ;
290
+
291
+ // Need to check if it is packed half integer/float ?
292
+ }
293
+
223
294
// We will do send fusion for a few messages. Those messages all
224
- // have the address payload for each channel , thus address payload
225
- // is 1 GRF for exec_size=8 (no A64 messages for now).
295
+ // have DW-sized address for each lane , thus address payload is
296
+ // 1 GRF for exec_size=8 (no A64 messages for now).
226
297
//
227
298
// The optimization is performed for the following cases:
228
299
// 1) [(w)] send(8) + send(8) --> (W&flag) send(16), and
@@ -286,10 +357,52 @@ bool SendFusion::simplifyAndCheckCandidate(INST_LIST_ITER Iter)
286
357
return false ;
287
358
}
288
359
289
- // Unless we can prove there are no aliases of two sends, we will not be
290
- // able to do fusion (or we know for sure that the first addr's value or
291
- // last addr's value is taken). For now, disable it.
292
- if (msgDesc->isDataPortWrite ())
360
+ // For write messages:
361
+ // unless we can prove there are no aliases between the two sends' address payloads,
362
+ // we will not be able to do fusion as hardware does not have deterministic
363
+ // behavior if the same address appears more than once. For example,
364
+ //
365
+ // send (8) (a1_0, C, a1_2, ..., a1_7) (d1_0, d1_1, d1_2, ..., d1_7)
366
+ // send (8) (a2_0, C, a2_2, ..., a2_7) (d2_0, d2_1, d2_2, ..., d2_7)
367
+ //
368
+ // Both sends have the same addr 'C' in lane 1, and C's value is d2_1 after
369
+ // two sends. However, if they are fused, they become:
370
+ //
371
+ // send (16) (a1_0, C, ..., a2_0, C, ..., a2_7)
372
+ // (d1_0, d1_1, ..., d2_0, d2_1, ..., d2_7)
373
+ //
374
+ // The hardware cannot guarantee that d2_1 will be the value written to addr 'C'.
375
+ // Thus, we have to disallow fusion for writes
376
+ //
377
+ // But we can do it for atomic messages if certain conditions are met:
378
+ // (Checked in isAtomicCandidate().)
379
+ // 1. Can fuse if no return value.
380
+ // Atomic messages have both read/write. Let us take a look at atomic_add
381
+ // for example: let's assume atomic_add returns the old value and then updates it.
382
+ //
383
+ // location at p : 10 (original value)
384
+ // x = atomic_add p, 1
385
+ // y = atomic_add p, 2
386
+ // then, x = 10 & y = 11
387
+ //
388
+ // If we fuse them, it becomes:
389
+ // {x, y} = atomic_add {p, p}, {1, 2}
390
+ // it's possible that we have x = 12 & y = 10 as the 2nd atomic operation
391
+ // could be performed first. This does change the behavior of the program.
392
+ //
393
+ // Since we don't know if two sends share the same address, we will conservatively
394
+ // avoid fusing two sends with return values for now.
395
+ // 2. Assume unsafe math is present for float atomic.
396
+ // As two sends might share the same location, and the fused send might change the order
397
+ // of float atomic operations, we need to have unsafe-math to perform fusing legally.
398
+ //
399
+ bool isAtomicCand = false ;
400
+ if (Builder->getOption (vISA_EnableAtomicFusion))
401
+ {
402
+ isAtomicCand = isAtomicCandidate (msgDesc);
403
+ }
404
+ if ((!isAtomicCand && msgDesc->isDataPortWrite ()) ||
405
+ (isAtomicCand && rspLen > 0 ))
293
406
{
294
407
return false ;
295
408
}
@@ -317,6 +430,14 @@ bool SendFusion::simplifyAndCheckCandidate(INST_LIST_ITER Iter)
317
430
return false ;
318
431
}
319
432
433
+ // Only handling the following messages that have DW as data element.
434
+ // Untyped with up to 4 DW per lane is handled.
435
+
436
+ // special handling of atomic
437
+ if (isAtomicCand) {
438
+ return true ;
439
+ }
440
+
320
441
uint32_t funcID = msgDesc->getFuncId ();
321
442
uint32_t msgType = msgDesc->getMessageType ();
322
443
if (funcID == SFID_DP_DC)
@@ -542,9 +663,12 @@ bool SendFusion::canFusion(INST_LIST_ITER IT0, INST_LIST_ITER IT1)
542
663
// and hardware does not have deterministic behavior about which data,
543
664
// data0 or data1, will be stored into x! For this reason, no write
544
665
// will be fused if they have common address. Since we don't know (for now)
545
- // if addresses of two sends can point to the same address , we just
666
+ // if addresses of two sends can point to the same location , we just
546
667
// conservatively do not fuse any write messages.
547
- //
668
+ //
669
+ // Atomic messages:
670
+ // As only atomics with no return value can be fused, RAW will always be false.
671
+ //
548
672
G4_SendMsgDescriptor* desc0 = I0->getMsgDesc ();
549
673
G4_SendMsgDescriptor* desc1 = I1->getMsgDesc ();
550
674
bool fusion = I0->getOption () == I1->getOption () &&
@@ -1382,33 +1506,39 @@ bool SendFusion::run(G4_BB* BB)
1382
1506
++II0;
1383
1507
continue ;
1384
1508
}
1385
-
1509
+
1386
1510
G4_INST* inst1 = nullptr ;
1387
1511
INST_LIST_ITER II1 = II0;
1388
1512
++II1;
1389
1513
while (II1 != IE)
1390
1514
{
1391
1515
G4_INST* tmp = *II1;
1392
- if (tmp->opcode () == inst0->opcode () &&
1393
- tmp->getExecSize () == inst0->getExecSize () &&
1394
- simplifyAndCheckCandidate (II1))
1516
+ if (simplifyAndCheckCandidate (II1))
1395
1517
{
1396
- if (canFusion (II0, II1))
1518
+ // possible 2nd send to be fused
1519
+ if (tmp->opcode () == inst0->opcode () &&
1520
+ tmp->getExecSize () == inst0->getExecSize ())
1397
1521
{
1398
- // Found
1399
- inst1 = tmp;
1522
+ if (canFusion (II0, II1))
1523
+ {
1524
+ // Found
1525
+ inst1 = tmp;
1526
+ }
1400
1527
}
1401
1528
1402
- // Don't advance II1 as II1 might be the first send
1403
- // for the next pair of candidates.
1529
+ // If found (inst1 != null), exit the inner loop to start fusing;
1530
+ // if not found, exit the inner loop and use this one (II1) as
1531
+ // the 1st send of possible next pair to start the outer loop again.
1532
+ //
1533
+ // In both cases, don't advance II1.
1404
1534
break ;
1405
1535
}
1406
1536
1407
1537
++II1;
1408
1538
if (tmp->isSend () || tmp->isFence () || tmp->isOptBarrier ())
1409
1539
{
1410
1540
// Don't try to fuse two sends that are separated
1411
- // by other memory/barrier instructions.
1541
+ // by other memory/barrier instructions.
1412
1542
break ;
1413
1543
}
1414
1544
}
@@ -1457,9 +1587,8 @@ bool SendFusion::run(G4_BB* BB)
1457
1587
//
1458
1588
// [(w)] send(8) + send(8) --> (W&flag) send(16)
1459
1589
//
1460
- // Either noMask or not. When no NoMask
1461
- //
1462
- // Note that (w) send(1|2|4) is also supported.
1590
+ // Either NoMask or not. When NoMask (w), send insts with
1591
+ // execsize=1|2|4 are also supported.
1463
1592
//
1464
1593
bool vISA::doSendFusion (FlowGraph* aCFG, Mem_Manager* aMMgr)
1465
1594
{
0 commit comments