@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
352
352
return 1 ;
353
353
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354
354
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
355
357
case AMDGPU::S_LOAD_DWORDX2_IMM:
356
358
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357
359
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363
365
return 2 ;
364
366
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365
367
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
366
370
case AMDGPU::S_LOAD_DWORDX3_IMM:
367
371
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368
372
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
374
378
return 3 ;
375
379
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376
380
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
377
383
case AMDGPU::S_LOAD_DWORDX4_IMM:
378
384
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379
385
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
385
391
return 4 ;
386
392
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387
393
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
388
396
case AMDGPU::S_LOAD_DWORDX8_IMM:
389
397
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390
398
return 8 ;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
499
507
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500
508
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501
509
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
502
514
return S_BUFFER_LOAD_IMM;
503
515
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504
516
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505
517
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506
518
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507
519
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
508
524
return S_BUFFER_LOAD_SGPR_IMM;
509
525
case AMDGPU::S_LOAD_DWORD_IMM:
510
526
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
587
603
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588
604
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589
605
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
590
610
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591
611
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592
612
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593
613
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594
614
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595
615
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
596
620
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597
621
case AMDGPU::S_LOAD_DWORD_IMM:
598
622
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,13 +727,21 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703
727
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704
728
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705
729
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
706
734
Result.SOffset = true ;
707
735
[[fallthrough]];
708
736
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709
737
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710
738
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711
739
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712
740
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
713
745
case AMDGPU::S_LOAD_DWORD_IMM:
714
746
case AMDGPU::S_LOAD_DWORDX2_IMM:
715
747
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1679
1711
return New;
1680
1712
}
1681
1713
1714
+ static bool needsConstrainedOpcode (const GCNSubtarget &STM,
1715
+ ArrayRef<MachineMemOperand *> MMOs,
1716
+ unsigned Width) {
1717
+ // Returns true only when XNACK is enabled and either there is not exactly one MMO or that MMO's alignment value is below Width * 4; i.e. the answer is conservatively true when a single MMO to check is not available.
1718
+ return STM.isXNACKEnabled () &&
1719
+ (MMOs.size () != 1 || MMOs[0 ]->getAlign ().value () < Width * 4 );
1720
+ }
1721
+
1682
1722
unsigned SILoadStoreOptimizer::getNewOpcode (const CombineInfo &CI,
1683
1723
const CombineInfo &Paired) {
1684
1724
const unsigned Width = CI.Width + Paired.Width ;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1696
1736
1697
1737
case UNKNOWN:
1698
1738
llvm_unreachable (" Unknown instruction class" );
1699
- case S_BUFFER_LOAD_IMM:
1739
+ case S_BUFFER_LOAD_IMM: {
1740
+ // If XNACK is enabled, use the constrained opcodes when the first load is
1741
+ // under-aligned.
1742
+ bool NeedsConstrainedOpc =
1743
+ needsConstrainedOpcode (*STM, CI.I ->memoperands (), Width);
1700
1744
switch (Width) {
1701
1745
default :
1702
1746
return 0 ;
1703
1747
case 2 :
1704
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1748
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1749
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1705
1750
case 3 :
1706
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1751
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1752
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1707
1753
case 4 :
1708
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1754
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1755
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709
1756
case 8 :
1710
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1757
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1758
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1711
1759
}
1712
- case S_BUFFER_LOAD_SGPR_IMM:
1760
+ }
1761
+ case S_BUFFER_LOAD_SGPR_IMM: {
1762
+ // If XNACK is enabled, use the constrained opcodes when the first load is
1763
+ // under-aligned.
1764
+ bool NeedsConstrainedOpc =
1765
+ needsConstrainedOpcode (*STM, CI.I ->memoperands (), Width);
1713
1766
switch (Width) {
1714
1767
default :
1715
1768
return 0 ;
1716
1769
case 2 :
1717
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1770
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1771
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1718
1772
case 3 :
1719
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1773
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1774
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720
1775
case 4 :
1721
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1776
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1777
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722
1778
case 8 :
1723
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1779
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1780
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724
1781
}
1782
+ }
1725
1783
case S_LOAD_IMM: {
1726
1784
// If XNACK is enabled, use the constrained opcodes when the first load is
1727
1785
// under-aligned.
1728
- const MachineMemOperand *MMO = *CI.I ->memoperands_begin ();
1729
1786
bool NeedsConstrainedOpc =
1730
- STM-> isXNACKEnabled () && MMO-> getAlign (). value () < Width * 4 ;
1787
+ needsConstrainedOpcode (*STM, CI. I -> memoperands (), Width) ;
1731
1788
switch (Width) {
1732
1789
default :
1733
1790
return 0 ;
0 commit comments