@@ -2270,37 +2270,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fall back to V_MOV based lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecSrcReg
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // Test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a src is the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }