@@ -2278,15 +2278,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
                            ? AMDGPU::V_MOV_B64_PSEUDO
                            : AMDGPU::V_MOV_B32_e32;
     Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
     Register DstReg = MI.getOperand(0).getReg();
     MachineOperand &ActiveSrc = MI.getOperand(1);
     MachineOperand &InactiveSrc = MI.getOperand(2);
 
-    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
-
-    // Find implicit exec src if this is running in WWM.
-    Register ExecSrcReg = 0;
+    // Find the implicit register defining lanes active outside WWM.
+    // Note: the default is ExecReg so that functional MIR is still
+    // generated if the implicit def is not found and assertions are disabled.
+    Register ExecSrcReg = ExecReg;
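+    // The saved exec mask is expected to be attached as an implicit use by
+    // the pass that sets up the WWM region (see the assert below).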
     for (auto &Op : MI.implicit_operands()) {
       if (Op.isDef() || !Op.isReg())
         continue;
@@ -2297,120 +2296,135 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       ExecSrcReg = OpReg;
       break;
     }
+    assert(ExecSrcReg != ExecReg &&
+           "V_SET_INACTIVE must be in known WWM region");
 
     // Ideally in WWM this operation is lowered to V_CNDMASK; however,
     // constant bus constraints and the presence of literal constants
     // present an issue.
     // Fallback to V_MOV base lowering in all but the common cases.
-    bool InWWM = !!ExecSrcReg;
-    bool UseVCndMask = false;
-    if (InWWM) {
-      const MachineFunction *MF = MI.getParent()->getParent();
-      const MachineRegisterInfo &MRI = MF->getRegInfo();
-      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
-      const MCInstrDesc &Desc = get(Opcode);
-      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
-      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
-      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
-      int ConstantBusUses = 1; // Starts at one for ExecRegSrc
-      int LiteralConstants = 0;
-      ConstantBusUses +=
-          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
-      ConstantBusUses +=
-          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
-      LiteralConstants +=
-          ActiveSrc.isImm() &&
-                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
-              ? 1
-              : 0;
-      LiteralConstants +=
-          InactiveSrc.isImm() &&
-                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
-              ? 1
-              : 0;
-      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
-                    LiteralConstants <= LiteralLimit &&
-                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
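+    // The 32-bit halves of the immediates feed the dual V_CNDMASK_B32
+    // decomposition of 64-bit operations below.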
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
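+    // A VOP3 encoding can read at most ConstantBusLimit SGPR/literal
+    // operands (one before GFX10) and can carry a literal only where
+    // hasVOP3Literal() holds; count what the V_CNDMASK would need.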
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+        (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
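+      // A register source is always fine; an immediate is fine if both
+      // 32-bit halves are inline constants, or if the full 64-bit value
+      // was already counted as a literal above (each half-instruction then
+      // carries at most the one literal accounted for).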
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
     }
 
     if (UseVCndMask && VMov64) {
-      // WWM B64; decompose to two B32 operations.
-      // Test above ensures that both sources are registers.
-      // Note: this is done to avoid falling back to V_MOV multiple times
-      // and introducing exec manipulation for each VGPR separately.
-      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
-      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
-      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
-      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
-      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
-      MachineInstr *Tmp;
-      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
-                    RI.getSubReg(DstReg, AMDGPU::sub0))
-                .addReg(InactiveLo)
-                .addReg(ActiveLo)
-                .addReg(ExecSrcReg, RegState::Implicit)
-                .addReg(DstReg, RegState::ImplicitDefine);
-      expandPostRAPseudo(*Tmp);
-      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
-                    RI.getSubReg(DstReg, AMDGPU::sub1))
-                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
-                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
-                .addReg(ExecSrcReg, RegState::Implicit)
-                .addReg(DstReg, RegState::ImplicitDefine);
-      expandPostRAPseudo(*Tmp);
+      // Dual V_CNDMASK_B32
+      MachineOperand ActiveLo =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill=*/false)
+              : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+      MachineOperand ActiveHi =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill=*/ActiveSrc.isKill())
+              : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
      MachineOperand InactiveLo =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill=*/false)
+              : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+      MachineOperand InactiveHi =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill=*/InactiveSrc.isKill())
+              : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
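+      // V_CNDMASK_B32_e64 operand order: src0_modifiers, src0 (inactive
+      // value), src1_modifiers, src1 (active value), then the lane-select
+      // mask. DstReg is also marked implicit-def so each 32-bit half write
+      // counts as defining the full 64-bit register.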
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
     } else if (UseVCndMask) {
-      // WWM B32; use V_CNDMASK.
-      MachineInstr *VCndMask =
-          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
-              .addImm(0)
-              .add(InactiveSrc)
-              .addImm(0)
-              .add(ActiveSrc)
-              .addReg(ExecSrcReg);
-      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
-      for (auto &Op : MI.implicit_operands()) {
-        if (!Op.isDef())
-          continue;
-        VCndMask->addOperand(Op);
-      }
+      // Single V_CNDMASK_B32
+      BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
     } else {
       // Fallback V_MOV case.
-      // Avoid unnecessary work if a src is the destination.
+      // Avoid unnecessary work if a source VGPR is also the destination.
       // This can happen if WWM register allocation was efficient.
-      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
-      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
-      if (!SkipActive) {
-        if (InWWM) {
-          // Cancel WWM
-          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+      // Note: this assumes WWM execution.
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
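+      // Sequence (exec is all-ones on entry): write InactiveSrc to all
+      // lanes (or only the inactive lanes if the active lanes must be
+      // preserved), then restrict exec to the saved mask and write
+      // ActiveSrc, and finally restore exec to -1 for WWM.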
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          MachineInstr *ExecMI =
+              BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+          ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
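+          // (NotOpc is the scalar S_NOT_*, which defines SCC; nothing here
+          // reads it, so it is marked dead.)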
         }
-        // Copy active lanes
+        // Copy inactive lanes
         MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
-                .add(ActiveSrc);
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
         if (VMov64)
           expandPostRAPseudo(*VMov);
       }
-      if (!SkipInactive) {
-        // Set exec mask to inactive lanes
-        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
-                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
-        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-        // Copy inactive lanes
+      if (!DstIsActive) {
+        // Set exec mask to active lanes
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        // Copy active lanes
         MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
         if (VMov64)
           expandPostRAPseudo(*VMov);
-        if (!InWWM) {
-          // Restore original exec mask
-          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
-        }
-      }
-      if (InWWM) {
-        // Restore WWM
-        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
       }
+      // Restore WWM
+      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
     }
     MI.eraseFromParent();
     break;
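For reference, a sketch of the common-case lowering this change produces
(illustrative MIR; the register names are hypothetical):

    ; before: active lanes take $vgpr1, inactive lanes take $vgpr2;
    ; $sgpr4_sgpr5 holds the pre-WWM exec mask as an implicit use
    $vgpr0 = V_SET_INACTIVE_B32 $vgpr1, $vgpr2, implicit $sgpr4_sgpr5
    ; after: a single per-lane select, with no exec manipulation
    $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr2, 0, $vgpr1, $sgpr4_sgpr5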