@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2432,6 +2432,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2472,9 +2677,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       Offset = 0;
 
       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
       else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
     } else if (MaterializedReg) {
       // If we can't fold the other operand, do another increment.
       Register DstReg = DstOp.getReg();
@@ -2497,27 +2702,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       // If we don't have any other offset to apply, we can just directly
       // interpret the frame index as the offset.
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     }
 
     if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(FIOperandNum);
       MI->setDesc(
           TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
     }
 
-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
     return true;
   }
   default: {
@@ -2533,7 +2738,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
         // The offset is always swizzled, just replace it
         if (FrameReg)
-          FIOp.ChangeToRegister(FrameReg, false);
+          FIOp->ChangeToRegister(FrameReg, false);
 
         MachineOperand *OffsetOp =
             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2586,18 +2791,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       }
 
       if (!FrameReg) {
-        FIOp.ChangeToImmediate(Offset);
-        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+        FIOp->ChangeToImmediate(Offset);
+        if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
           return false;
       }
 
       // We need to use register here. Check if we can use an SGPR or need
       // a VGPR.
-      FIOp.ChangeToRegister(AMDGPU::M0, false);
-      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+      FIOp->ChangeToRegister(AMDGPU::M0, false);
+      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
       if (!Offset && FrameReg && UseSGPR) {
-        FIOp.setReg(FrameReg);
+        FIOp->setReg(FrameReg);
         return false;
       }
 
@@ -2606,8 +2811,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       Register TmpReg =
          RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-      FIOp.setReg(TmpReg);
-      FIOp.setIsKill();
+      FIOp->setReg(TmpReg);
+      FIOp->setIsKill();
 
       if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2636,8 +2841,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2885,7 +3090,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
 
@@ -2916,13 +3121,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // If the offset is simply too big, don't convert to a scratch wave offset
     // relative index.
 
-    FIOp.ChangeToImmediate(Offset);
-    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+    FIOp->ChangeToImmediate(Offset);
+    if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
       Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                       MI, false, 0);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
-      FIOp.ChangeToRegister(TmpReg, false, false, true);
+      FIOp->ChangeToRegister(TmpReg, false, false, true);
     }
   }
 }
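
For context on the V_ADD hunk above: on targets without flat scratch, the SGPR frame register holds a swizzled, wave-scaled byte offset, so the rewrite first unswizzles it with V_LSHRREV_B32 by log2(wavefront size), then folds the frame object's constant offset into an immediate operand. A minimal standalone sketch of that address arithmetic, assuming wave64 (the register values and variable names are illustrative, not taken from the change):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned WavefrontSizeLog2 = 6; // wave64 assumed for illustration
      uint32_t FrameRegValue = 0x1000;      // swizzled SGPR frame-register value
      int64_t ObjectOffset = 16;            // stands in for FrameInfo.getObjectOffset(Index)

      // V_LSHRREV_B32: unswizzle the wave-scaled base into a per-lane base.
      uint32_t LaneBase = FrameRegValue >> WavefrontSizeLog2;

      // V_ADD_U32: the leftover constant offset is folded as an immediate.
      uint32_t Address = LaneBase + static_cast<uint32_t>(ObjectOffset);
      printf("lane base = %u, address = %u\n", LaneBase, Address);
      return 0;
    }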
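
When the non-frame-index operand is a register and a nonzero object offset remains, the V_ADD case emits a second add: the inserted AddI32 computes dst = base + other, and the original instruction is reused as dst = dst + offset with its frame-index operand changed to an immediate. A small model of that two-instruction rewrite (plain C++; the function name is hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Models "dst = frameindex + other" after elimination: the inserted add
    // combines the materialized base with the register operand, then the
    // original add applies the leftover immediate offset.
    uint32_t lowerFrameIndexAdd(uint32_t LaneBase, uint32_t Other,
                                uint32_t Offset) {
      uint32_t Dst = LaneBase + Other; // inserted V_ADD (AddI32)
      Dst += Offset;                   // original MI, FI operand now immediate
      return Dst;
    }

    int main() {
      printf("%u\n", lowerFrameIndexAdd(64, 8, 16)); // prints 88
      return 0;
    }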