@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
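
Note (illustrative, not part of the patch text): FIOp changes from a reference to a pointer because the new V_ADD_* handling added below may need to re-point it at a different operand after commuting the instruction, which a reference cannot express. The pattern, as it appears later in this diff:

    MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
      std::swap(FIOp, OtherOp); // re-point both operands; not possible with references
      std::swap(FIOperandNum, OtherOpIdx);
    }
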
@@ -2432,6 +2432,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
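
Illustrative sketch, not part of the patch: the new V_ADD_* case lets frame-index elimination fold the index directly into the add instead of always materializing the address separately. Assuming a kernel where no frame register is needed and the stack object sits at offset 0 (registers and the stack object are hypothetical, kill/renamable flags omitted), the add degenerates into a move of the other operand:

    $vgpr0 = V_ADD_U32_e32 %stack.0, $vgpr1, implicit $exec
      -->  $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec

and if the other operand is already the destination register, the instruction is erased outright as an identity copy.
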
@@ -2495,32 +2697,32 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else if (!OtherOp.isImm() && !MaterializedReg) {
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       assert(Offset == 0);
 
       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
       else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
     }
 
     if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(FIOperandNum);
       MI->setDesc(
           TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
     }
 
-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
 
     return true;
   }
@@ -2537,7 +2739,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2590,18 +2792,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
@@ -2610,8 +2812,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2640,8 +2842,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2889,7 +3091,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
 
@@ -2920,13 +3122,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.
 
-    FIOp.ChangeToImmediate(Offset);
-    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+    FIOp->ChangeToImmediate(Offset);
+    if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
       Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                       MI, false, 0);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
-      FIOp.ChangeToRegister(TmpReg, false, false, true);
+      FIOp->ChangeToRegister(TmpReg, false, false, true);
     }
   }
 }
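
A second illustrative sketch of the V_ADD_* hunk above, again hypothetical and with kill/renamable flags omitted: when a frame register is needed and flat scratch is disabled, the frame register is shifted right by log2(wavefront size) into a scavenged VGPR, the add is rebuilt against that VGPR, and the original instruction is reused to apply the remaining constant object offset. With a made-up frame register $sgpr33, object offset 16, and wave64 (shift amount 6):

    $vgpr0 = V_ADD_U32_e32 %stack.1, $vgpr2, implicit $exec
      -->
    $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
    $vgpr0 = V_ADD_U32_e32 16, $vgpr0, implicit $exec

All register numbers and the stack object are invented for illustration only.
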