@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
         "unreserved scratch RSRC register");

-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2432,6 +2432,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    MI->eraseFromParent();
    return true;
  }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+          .add(*OtherOp)
+          .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+          .addReg(MaterializedReg, MaterializedRegFlags)
+          .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
  case AMDGPU::S_ADD_I32: {
    // TODO: Handle s_or_b32, s_and_b32.
    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2472,9 +2674,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      Offset = 0;

      if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
      else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
    } else if (MaterializedReg) {
      // If we can't fold the other operand, do another increment.
      Register DstReg = DstOp.getReg();
@@ -2497,27 +2699,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      OtherOp.ChangeToRegister(MaterializedReg, false);
      OtherOp.setIsKill(true);
      OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
    } else {
      // If we don't have any other offset to apply, we can just directly
      // interpret the frame index as the offset.
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
    }

    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
      assert(Offset == 0);
      MI->removeOperand(3);
      MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
      assert(Offset == 0);
      MI->removeOperand(3);
      MI->removeOperand(FIOperandNum);
      MI->setDesc(
          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    }

-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
    return true;
  }
  default: {
@@ -2533,7 +2735,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

        // The offset is always swizzled, just replace it
        if (FrameReg)
-          FIOp.ChangeToRegister(FrameReg, false);
+          FIOp->ChangeToRegister(FrameReg, false);

        MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2586,18 +2788,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      }

      if (!FrameReg) {
-        FIOp.ChangeToImmediate(Offset);
-        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+        FIOp->ChangeToImmediate(Offset);
+        if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
          return false;
      }

      // We need to use register here. Check if we can use an SGPR or need
      // a VGPR.
-      FIOp.ChangeToRegister(AMDGPU::M0, false);
-      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+      FIOp->ChangeToRegister(AMDGPU::M0, false);
+      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);

      if (!Offset && FrameReg && UseSGPR) {
-        FIOp.setReg(FrameReg);
+        FIOp->setReg(FrameReg);
        return false;
      }

@@ -2606,8 +2808,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

      Register TmpReg =
          RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-      FIOp.setReg(TmpReg);
-      FIOp.setIsKill();
+      FIOp->setReg(TmpReg);
+      FIOp->setIsKill();

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2636,8 +2838,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
      }

      if (NeedSaveSCC) {
@@ -2885,7 +3087,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        MI->eraseFromParent();
        return true;
      }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
      return false;
    }

@@ -2916,13 +3118,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
        Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                        MI, false, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }