@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+                               const GCNSubtarget *STI = nullptr);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
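The four hunks above all enforce the same invariant: each constrained `_ec` scalar load must be classified exactly like its unconstrained twin by `getOpcodeWidth`, `getInstClass`, `getInstSubclass`, and `getRegs`, so the optimizer pairs and merges them identically. A hypothetical helper (not part of this patch) that makes the intended mapping explicit:

// Illustrative only: the opcode mapping the case labels above rely on.
// Every "_ec" form classifies like the base S_LOAD_*_IMM opcode.
static unsigned stripEarlyClobberForm(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_LOAD_DWORD_IMM_ec:   return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec: return AMDGPU::S_LOAD_DWORDX2_IMM;
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec: return AMDGPU::S_LOAD_DWORDX3_IMM;
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec: return AMDGPU::S_LOAD_DWORDX4_IMM;
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return AMDGPU::S_LOAD_DWORDX8_IMM;
  default:                            return Opc; // already unconstrained
  }
}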
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(

   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  // The constrained sload instructions in the S_LOAD_IMM class have the
+  // early-clobber flag set on their dst operands. Remove the flag before
+  // reusing the MOs in the copies below.
+  if (Dest0->isEarlyClobber())
+    Dest0->setIsEarlyClobber(false);
+
+  if (Dest1->isEarlyClobber())
+    Dest1->setIsEarlyClobber(false);

   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
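Why the flag has to go: early-clobber on a def tells the register allocator that the result may be written before all source operands are read, so it must not overlap them. That constraint belongs to the merged `*_IMM_ec` load itself, not to the plain COPYs that fan the wide result back out to the original destinations. A minimal sketch of the idea, using the real MachineOperand flag accessors (the helper name is ours, not the patch's):

// Illustrative helper: copyToDestRegs reuses the original dst operands
// verbatim, so an early-clobber flag inherited from a constrained load
// must be cleared before the operand is attached to a COPY.
static void dropEarlyClobber(llvm::MachineOperand &MO) {
  if (MO.isEarlyClobber())       // set on the *_IMM_ec load's def
    MO.setIsEarlyClobber(false); // a COPY result needs no such constraint
}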
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  const unsigned Opcode = getNewOpcode(CI, Paired);
+  const unsigned Opcode = getNewOpcode(CI, Paired, STM);

   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   Register DestReg = MRI->createVirtualRegister(SuperRC);
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 }

 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
-                                            const CombineInfo &Paired) {
+                                            const CombineInfo &Paired,
+                                            const GCNSubtarget *STI) {
   const unsigned Width = CI.Width + Paired.Width;

   switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
   case S_LOAD_IMM:
-    switch (Width) {
-    default:
-      return 0;
-    case 2:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
-    case 3:
-      return AMDGPU::S_LOAD_DWORDX3_IMM;
-    case 4:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
-    case 8:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
+    // For targets that support XNACK replay, use the constrained load opcode.
+    if (STI && STI->hasXnackReplay()) {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+      }
+    } else {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM;
+      }
     }
   case GLOBAL_LOAD:
     switch (Width) {
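Design note on the new parameter: `STI` is defaulted to `nullptr` in the declaration, so only the scalar-load merge path (`mergeSMemLoadImmPair`, which now passes `STM`) opts in to the constrained opcodes; every other caller of `getNewOpcode` is unchanged. A behavior sketch, assuming `GCNSubtarget::hasXnackReplay()` as used in the hunk above:

// Same width-2 S_LOAD_IMM merge, different opcode depending on the caller:
//
//   getNewOpcode(CI, Paired);       // STI == nullptr
//     -> AMDGPU::S_LOAD_DWORDX2_IMM
//
//   getNewOpcode(CI, Paired, STM);  // SMEM path, STM->hasXnackReplay()
//     -> AMDGPU::S_LOAD_DWORDX2_IMM_ec (early-clobber dst, safe for replay)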