@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     } else {
       if (MaterializedReg)
         OtherOp.ChangeToRegister(MaterializedReg, false);
-      FIOp.ChangeToImmediate(NewOffset);
+      FIOp->ChangeToImmediate(NewOffset);
     }
 
     return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
        return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       MI->eraseFromParent();
       return true;
     }
-    FIOp.ChangeToRegister(ResultReg, false, false, true);
+    FIOp->ChangeToRegister(ResultReg, false, false, true);
     return false;
   }
 
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       // If the offset is simply too big, don't convert to a scratch wave offset
       // relative index.
 
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                         MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }