@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
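+    // B32 shares the lowering below with B64; VMovOpc and VMov64 distinguish
+    // the 32-bit and 64-bit forms.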
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
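+    // Operand 1 supplies the value for lanes that are active outside WWM;
+    // operand 2 supplies the value for the remaining (inactive) lanes.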
+
+    // Find implicit register defining lanes active outside WWM.
+    // Note: default here is set to ExecReg so that functional MIR is still
+    // generated if implicit def is not found and assertions are disabled.
+    Register ExecSrcReg = ExecReg;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+    assert(ExecSrcReg != ExecReg &&
+           "V_SET_INACTIVE must be in known WWM region");
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
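+    // Precompute the 64-bit immediates and their 32-bit halves; the halves
+    // decide whether a 64-bit operation can be split into two 32-bit selects
+    // without introducing extra literals.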
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
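+    // V_CNDMASK_B32_e64 reads ExecSrcReg through the constant bus, so count
+    // SGPR and literal uses of both sources against the subtarget's limits to
+    // decide whether the V_CNDMASK form is encodable.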
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+        (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // Dual V_CNDMASK_B32
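+      // Lower the 64-bit case as two 32-bit selects over the sub0/sub1 halves
+      // of the registers (or immediate halves), with ExecSrcReg choosing the
+      // active-lane value (src1) over the inactive-lane value (src0).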
+      MachineOperand ActiveLo =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+      MachineOperand ActiveHi =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+              : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+      MachineOperand InactiveLo =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+      MachineOperand InactiveHi =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+              : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+    } else if (UseVCndMask) {
+      // Single V_CNDMASK_B32
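+      // No decomposition is needed for the 32-bit form; the original operands
+      // are used directly.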
+      BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a source VGPR is also the destination.
+      // This can happen if WWM register allocation was efficient.
+      // Note: this assumes WWM execution.
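+      // Whichever source already occupies DstReg is left in place; only the
+      // lanes needing the other source are rewritten under a modified exec.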
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          MachineInstr *ExecMI =
+              BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+          ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        }
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!DstIsActive) {
+        // Set exec mask to active lanes
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      // Restore WWM
+      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+    }
     MI.eraseFromParent();
     break;
   }