@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                            ? &AMDGPU::SReg_32RegClass
                                            : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                    MI->getOpcode() == AMDGPU::S_MOV_B32;
       Register ResultReg =
           IsCopy ? MI->getOperand(0).getReg()
                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (Offset == 0) {
         unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                              : AMDGPU::V_LSHRREV_B32_e64;
-        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+        Register TmpResultReg = ResultReg;
+        if (IsSALU && LiveSCC) {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0);
+        }
+
+        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
           // For V_LSHRREV, the operands are reversed (the shift count goes
           // first).
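
An aside on the operand order mentioned in the comment above: V_LSHRREV_B32 takes the shift amount as its first source and the value to shift as its second (hence the "REV" in the name), which is why the builder adds the wavefront-size-log2 immediate before the frame register. A minimal scalar model of that operand order, as a standalone sketch with illustrative values (not LLVM code):

  #include <cassert>
  #include <cstdint>

  // Scalar model of V_LSHRREV_B32: source 0 is the shift count, source 1 is
  // the value being shifted; only the low 5 bits of the count matter for a
  // 32-bit shift.
  static uint32_t v_lshrrev_b32(uint32_t ShiftAmount, uint32_t Value) {
    return Value >> (ShiftAmount & 31);
  }

  int main() {
    const uint32_t WavefrontSizeLog2 = 6;      // wave64, illustrative
    const uint32_t FrameRegWaveSpace = 0x1000; // example wave-scaled frame value
    // Mirrors: V_LSHRREV_B32 dst, WavefrontSizeLog2, FrameReg
    assert(v_lshrrev_b32(WavefrontSizeLog2, FrameRegWaveSpace) ==
           FrameRegWaveSpace / 64);
    return 0;
  }
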
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (IsSALU && !LiveSCC)
           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
         if (IsSALU && LiveSCC) {
-          Register NewDest = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32RegClass, Shift, false, 0);
+          Register NewDest =
+              IsCopy ? ResultReg
+                     : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                     Shift, false, 0);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                   NewDest)
-              .addReg(ResultReg);
+              .addReg(TmpResultReg);
           ResultReg = NewDest;
         }
       } else {
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
           // We may have 1 free scratch SGPR even though a carry out is
           // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = IsCopy && IsSALU
+                                      ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32_XM0RegClass, MI,
+                                            false, 0, /*AllowSpill=*/false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpResultReg = ScaledReg;
+
+          if (!LiveSCC) {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+                .addReg(TmpResultReg, RegState::Kill)
+                .addImm(Offset);
+          } else {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+            MachineInstrBuilder Add;
+            if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+              if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+                BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                        ResultReg)
+                    .addImm(Offset);
+                Add.addReg(ResultReg, RegState::Kill)
+                    .addReg(TmpResultReg, RegState::Kill)
+                    .addImm(0);
+              } else
+                Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+            } else {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                      TmpResultReg)
+                  .addImm(Offset);
+              assert(Offset > 0 &&
+                     isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                     "offset is unsafe for v_mad_u32_u24");
+              // We start with a frame pointer with a wave space value, and an
+              // offset in lane-space. We are materializing a lane space
+              // value. We can either do a right shift of the frame pointer to
+              // get to lane space, or a left shift of the offset to get to
+              // wavespace. We can right shift after the computation to get
+              // back to the desired per-lane value.
+              // We are using the mad_u32_u24 primarily as an add with no
+              // carry out clobber.
+              Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                            TmpResultReg)
+                        .addReg(TmpResultReg, RegState::Kill)
+                        .addImm(ST.getWavefrontSize())
+                        .addReg(FrameReg)
+                        .addImm(0);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+            }
 
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(Offset);
+            Register NewDest = IsCopy ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32RegClass, *Add,
+                                            false, 0, /*AllowSpill=*/true);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    NewDest)
+                .addReg(TmpResultReg);
+            ResultReg = NewDest;
+          }
           if (!IsSALU)
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-                .addReg(ScaledReg, RegState::Kill);
+                .addReg(TmpResultReg, RegState::Kill);
           else
-            ResultReg = ScaledReg;
-
+            ResultReg = TmpResultReg;
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
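
The v_mad_u32_u24 branch added above relies on the arithmetic its comment describes: the frame pointer holds a wave-space value whose low wavefront-size-log2 bits are zero, so multiplying the lane-space offset by the wavefront size, adding the frame pointer with the mad (which has no carry-out operand to clobber), and shifting right again gives the same per-lane value as scaling the frame pointer down first and then adding the offset. A standalone sketch of that identity with made-up example values (plain C++, not LLVM code; the 24-bit bound mirrors the assert on getMaxWaveScratchSize):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t WavefrontSizeLog2 = 6; // wave64, illustrative
    const uint32_t WavefrontSize = 1u << WavefrontSizeLog2;
    const uint32_t FrameReg = 0x2000;     // wave-space frame value, wave-aligned
    const uint32_t Offset = 0x44;         // lane-space offset

    assert((FrameReg & (WavefrontSize - 1)) == 0 && "frame value is wave-aligned");
    assert(Offset < (1u << 24) && "mad_u32_u24 multiplies 24-bit operands");

    // What the comment describes: use the mad as an add with no carry-out
    // clobber, then shift right to return to lane space.
    uint32_t MadResult = Offset * WavefrontSize + FrameReg; // V_MAD_U32_U24
    uint32_t LaneSpace = MadResult >> WavefrontSizeLog2;    // V_LSHRREV_B32

    // Identity: same as scaling the frame pointer down first, then adding.
    assert(LaneSpace == (FrameReg >> WavefrontSizeLog2) + Offset);
    return 0;
  }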