@@ -2604,7 +2604,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                             ? &AMDGPU::SReg_32RegClass
                                             : &AMDGPU::VGPR_32RegClass;
         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                      MI->getOpcode() == AMDGPU::S_MOV_B32;
         Register ResultReg =
             IsCopy ? MI->getOperand(0).getReg()
                    : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2613,7 +2614,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (Offset == 0) {
           unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                                : AMDGPU::V_LSHRREV_B32_e64;
-          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+          Register TmpResultReg = ResultReg;
+          if (IsSALU && LiveSCC) {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0);
+          }
+
+          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
           if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
             // For V_LSHRREV, the operands are reversed (the shift count goes
             // first).
@@ -2623,11 +2630,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           if (IsSALU && !LiveSCC)
             Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
           if (IsSALU && LiveSCC) {
-            Register NewDest = RS->scavengeRegisterBackwards(
-                AMDGPU::SReg_32RegClass, Shift, false, 0);
+            Register NewDest =
+                IsCopy ? ResultReg
+                       : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                       Shift, false, 0);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                     NewDest)
-                .addReg(ResultReg);
+                .addReg(TmpResultReg);
             ResultReg = NewDest;
           }
         } else {
@@ -2678,22 +2687,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
           // We may have 1 free scratch SGPR even though a carry out is
           // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = IsCopy && IsSALU
+                                      ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32_XM0RegClass, MI,
+                                            false, 0, /*AllowSpill=*/false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpResultReg = ScaledReg;
+
+          if (!LiveSCC) {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+                .addReg(TmpResultReg, RegState::Kill)
+                .addImm(Offset);
+          } else {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+            MachineInstrBuilder Add;
+            if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+              if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+                BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                        ResultReg)
+                    .addImm(Offset);
+                Add.addReg(ResultReg, RegState::Kill)
+                    .addReg(TmpResultReg, RegState::Kill)
+                    .addImm(0);
+              } else
+                Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+            } else {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                      TmpResultReg)
+                  .addImm(Offset);
+              assert(Offset > 0 &&
+                     isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                     "offset is unsafe for v_mad_u32_u24");
+              // We start with a frame pointer with a wave space value, and an
+              // offset in lane-space. We are materializing a lane space
+              // value. We can either do a right shift of the frame pointer to
+              // get to lane space, or a left shift of the offset to get to
+              // wavespace. We can right shift after the computation to get
+              // back to the desired per-lane value.
+              // We are using the mad_u32_u24 primarily as an add with no
+              // carry out clobber.
+              Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                            TmpResultReg)
+                        .addReg(TmpResultReg, RegState::Kill)
+                        .addImm(ST.getWavefrontSize())
+                        .addReg(FrameReg)
+                        .addImm(0);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+            }
 
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(Offset);
+            Register NewDest = IsCopy ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32RegClass, *Add,
+                                            false, 0, /*AllowSpill=*/true);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    NewDest)
+                .addReg(TmpResultReg);
+            ResultReg = NewDest;
+          }
           if (!IsSALU)
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-                .addReg(ScaledReg, RegState::Kill);
+                .addReg(TmpResultReg, RegState::Kill);
           else
-            ResultReg = ScaledReg;
-
+            ResultReg = TmpResultReg;
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
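
Note (not part of the commit): a minimal standalone sketch of the wave-space vs. lane-space arithmetic described in the in-line comment above, assuming a wave64 target and illustrative register values chosen here for the example only.

#include <cassert>
#include <cstdint>

int main() {
  // Assumed example values, not taken from the commit: a frame pointer held
  // in wave space (per-lane byte offset scaled up by the wave size) and a
  // per-lane frame-index offset.
  const uint32_t WaveSizeLog2 = 6;           // wave64
  const uint32_t WaveSize = 1u << WaveSizeLog2;
  const uint32_t FrameReg = 0x400;           // wave-space frame pointer value
  const uint32_t Offset = 16;                // lane-space offset

  // SALU path: shift the frame pointer down to lane space, then add the
  // lane-space offset (S_LSHR_B32 followed by S_ADD_I32).
  uint32_t SALUResult = (FrameReg >> WaveSizeLog2) + Offset;

  // mad_u32_u24-style path: scale the offset up to wave space and add it to
  // the frame pointer (an add with no carry-out clobber), then shift the sum
  // back down to lane space.
  uint32_t MadResult = (Offset * WaveSize + FrameReg) >> WaveSizeLog2;

  // The two agree because a wave-space frame pointer has its low
  // WaveSizeLog2 bits clear, so the final right shift is exact.
  assert(SALUResult == MadResult);
  return 0;
}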