@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecSrcReg
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // The test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a source is also the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }
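The subtle part of the added code is the budget check that gates UseVCndMask: a single V_CNDMASK_B32_e64 is only legal if the two sources, plus the SGPR pair holding the saved exec mask, stay within the constant-bus and literal-constant limits. The standalone sketch below models just that counting logic in isolation; the struct, function names, and example limits are illustrative assumptions (the real code queries the subtarget via getConstantBusLimit and hasVOP3Literal and inspects MachineOperands), not the in-tree API.

// Illustrative sketch only; names and limits are assumptions, not LLVM APIs.
#include <cstdio>
#include <initializer_list>

struct SrcInfo {
  bool IsSGPR;      // operand reads the scalar register file
  bool IsImm;       // operand is an immediate
  bool IsInlineImm; // immediate is encodable as an inline constant
};

// An SGPR source or a non-inline (literal) immediate occupies the constant bus.
static bool usesConstantBus(const SrcInfo &S) {
  return S.IsSGPR || (S.IsImm && !S.IsInlineImm);
}

// Returns true when one V_CNDMASK_B32 can encode both sources: the saved exec
// mask in src2 already takes one constant-bus slot, and at most LiteralLimit
// literal constants are allowed (0 without VOP3 literals, 1 with them).
static bool fitsVCndMask(const SrcInfo &Active, const SrcInfo &Inactive,
                         int ConstantBusLimit, int LiteralLimit) {
  int BusUses = 1; // the SGPR (pair) holding the saved exec mask
  int Literals = 0;
  for (const SrcInfo *S : {&Active, &Inactive}) {
    BusUses += usesConstantBus(*S) ? 1 : 0;
    Literals += (S->IsImm && !S->IsInlineImm) ? 1 : 0;
  }
  return BusUses <= ConstantBusLimit && Literals <= LiteralLimit;
}

int main() {
  SrcInfo VGPR{false, false, false};
  SrcInfo LiteralImm{false, true, false};
  // Two VGPR sources: fits even with a constant-bus limit of 2.
  std::printf("vgpr/vgpr: %d\n", fitsVCndMask(VGPR, VGPR, 2, 1));
  // Two literal constants: over budget, so the fallback V_MOV path is taken.
  std::printf("lit/lit:   %d\n", fitsVCndMask(LiteralImm, LiteralImm, 2, 1));
}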