@@ -2615,7 +2615,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                           ? &AMDGPU::SReg_32RegClass
                                           : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                    MI->getOpcode() == AMDGPU::S_MOV_B32;
       Register ResultReg =
           IsCopy ? MI->getOperand(0).getReg()
                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2624,7 +2625,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (Offset == 0) {
         unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                              : AMDGPU::V_LSHRREV_B32_e64;
-        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+        Register TmpResultReg = ResultReg;
+        if (IsSALU && LiveSCC) {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0);
+        }
+
+        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
           // For V_LSHRREV, the operands are reversed (the shift count goes
           // first).
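(Note on the "operands are reversed" comment: the operand-appending lines it refers to sit just below this hunk and are unchanged by the patch. Roughly, and as an assumption about the surrounding function rather than part of this diff, they read:

    if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
      Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
    else
      Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());

that is, V_LSHRREV_B32 takes the shift amount as its first source operand, while S_LSHR_B32 takes the value being shifted first.)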
@@ -2634,11 +2641,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (IsSALU && !LiveSCC)
           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
         if (IsSALU && LiveSCC) {
-          Register NewDest = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32RegClass, Shift, false, 0);
+          Register NewDest =
+              IsCopy ? ResultReg
+                     : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                     Shift, false, 0);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                   NewDest)
-              .addReg(ResultReg);
+              .addReg(TmpResultReg);
           ResultReg = NewDest;
         }
       } else {
@@ -2689,22 +2698,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
           // We may have 1 free scratch SGPR even though a carry out is
           // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = IsCopy && IsSALU
+                                      ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32_XM0RegClass, MI,
+                                            false, 0, /*AllowSpill=*/false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpResultReg = ScaledReg;
+
+          if (!LiveSCC) {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+                .addReg(TmpResultReg, RegState::Kill)
+                .addImm(Offset);
+          } else {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+            MachineInstrBuilder Add;
+            if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+              if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+                BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                        ResultReg)
+                    .addImm(Offset);
+                Add.addReg(ResultReg, RegState::Kill)
+                    .addReg(TmpResultReg, RegState::Kill)
+                    .addImm(0);
+              } else
+                Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+            } else {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                      TmpResultReg)
+                  .addImm(Offset);
+              assert(Offset > 0 &&
+                     isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                     "offset is unsafe for v_mad_u32_u24");
+              // We start with a frame pointer with a wave space value, and an
+              // offset in lane-space. We are materializing a lane space
+              // value. We can either do a right shift of the frame pointer to
+              // get to lane space, or a left shift of the offset to get to
+              // wavespace. We can right shift after the computation to get
+              // back to the desired per-lane value.
+              // We are using the mad_u32_u24 primarily as an add with no
+              // carry out clobber.
+              Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                            TmpResultReg)
+                        .addReg(TmpResultReg, RegState::Kill)
+                        .addImm(ST.getWavefrontSize())
+                        .addReg(FrameReg)
+                        .addImm(0);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+            }
 
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(Offset);
+            Register NewDest = IsCopy ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32RegClass, *Add,
+                                            false, 0, /*AllowSpill=*/true);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    NewDest)
+                .addReg(TmpResultReg);
+            ResultReg = NewDest;
+          }
           if (!IsSALU)
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-                .addReg(ScaledReg, RegState::Kill);
+                .addReg(TmpResultReg, RegState::Kill);
           else
-            ResultReg = ScaledReg;
-
+            ResultReg = TmpResultReg;
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
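The v_mad_u32_u24 fallback leans on the arithmetic spelled out in the new comment: scaling the lane-space offset up to wave space, adding the wave-space frame pointer, and shifting back down gives the same per-lane value as shifting the frame pointer first and then adding the offset. A minimal standalone sketch of that identity, using made-up wave64 values (none of the constants below come from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t WaveSize = 64;  // illustrative: wave64
      const uint32_t Log2WaveSize = 6;
      const uint32_t FP = 0x2400;    // hypothetical wave-space frame pointer value
      const uint32_t Offset = 0x30;  // hypothetical lane-space frame-index offset

      // What the patch wants to materialize: the frame pointer scaled down
      // to lane space, plus the lane-space offset.
      uint32_t Direct = (FP >> Log2WaveSize) + Offset;

      // What the mad + shift sequence computes: the offset scaled up to wave
      // space, added to the frame pointer, then shifted back down. Since
      // Offset * WaveSize is a multiple of WaveSize, the floor division
      // distributes and both forms agree exactly.
      uint32_t ViaMad = (Offset * WaveSize + FP) >> Log2WaveSize;
      assert(Direct == ViaMad);

      // v_mad_u32_u24 multiplies 24-bit sources, so the offset has to stay
      // within that range; the patch's assert enforces an analogous bound
      // via ST.getMaxWaveScratchSize().
      assert(Offset < (1u << 24));
      return 0;
    }

The mad is used here purely as an add that produces no carry out, so nothing else (SCC, or a VCC/SGPR carry destination) is clobbered while those registers are live.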