@@ -2270,7 +2270,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2452,6 +2452,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+          .add(*OtherOp)
+          .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+          .addReg(MaterializedReg, MaterializedRegFlags)
+          .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2492,9 +2697,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       Offset = 0;
 
       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
       else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
     } else if (MaterializedReg) {
       // If we can't fold the other operand, do another increment.
       Register DstReg = DstOp.getReg();
@@ -2517,27 +2722,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       // If we don't have any other offset to apply, we can just directly
       // interpret the frame index as the offset.
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     }
 
     if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(FIOperandNum);
       MI->setDesc(
           TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
     }
 
-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
     return true;
   }
   default: {
@@ -2553,7 +2758,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2606,18 +2811,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
@@ -2626,8 +2831,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2656,8 +2861,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2905,7 +3110,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
 
@@ -2936,13 +3141,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.
 
-     FIOp.ChangeToImmediate(Offset);
-     if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+     FIOp->ChangeToImmediate(Offset);
+     if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
        Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                        MI, false, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addImm(Offset);
-       FIOp.ChangeToRegister(TmpReg, false, false, true);
+       FIOp->ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }