@@ -2098,8 +2098,22 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  }
}
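+// Return the implicit use of V_SET_INACTIVE that carries the mask of lanes
+// active outside WWM, skipping EXEC/EXEC_LO and SCC; a null Register means
+// no mask operand is attached.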
+Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
+  assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
+  for (auto &Op : MI.implicit_operands()) {
+    if (Op.isDef())
+      continue;
+    Register OpReg = Op.getReg();
+    if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+        OpReg == AMDGPU::SCC)
+      continue;
+    return OpReg;
+  }
+  return Register();
+}
+
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
@@ -2273,37 +2287,147 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
    MI.eraseFromParent();
    break;
  }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
  case AMDGPU::V_SET_INACTIVE_B64: {
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = RI.getExec();
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    // Find implicit register defining lanes active outside WWM.
+    Register ExecSrcReg = findSetInactiveMask(MI);
+    assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
+    // Note: default here is set to ExecReg so that functional MIR is still
+    // generated if the implicit def is not found and assertions are disabled.
+    if (!ExecSrcReg)
+      ExecSrcReg = ExecReg;
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    MachineFunction *MF = MBB.getParent();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        ((ActiveSrc.isReg() ||
+          (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
+             ? 0
+             : 1) +
+        ((InactiveSrc.isReg() ||
+          (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
+             ? 0
+             : 1);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
+    }
+
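+    // V_CNDMASK_B32 writes src1 where the selected mask bit is set and src0
+    // elsewhere, so passing ExecSrcReg as the condition gives ActiveSrc to
+    // lanes active outside WWM and InactiveSrc to all other lanes.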
+    if (UseVCndMask && VMov64) {
+      // Dual V_CNDMASK_B32
+      MachineOperand ActiveLo = buildExtractSubRegOrImm(
+          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
+      MachineOperand ActiveHi = buildExtractSubRegOrImm(
+          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
+      MachineOperand InactiveLo = buildExtractSubRegOrImm(
+          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
+      MachineOperand InactiveHi = buildExtractSubRegOrImm(
+          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
+      if (ActiveSrc.isReg())
+        ActiveHi.setIsKill(ActiveSrc.isKill());
+      if (InactiveSrc.isReg())
+        InactiveHi.setIsKill(InactiveSrc.isKill());
+      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+    } else if (UseVCndMask) {
+      // Single V_CNDMASK_B32
+      BuildMI(MBB, MI, DL, Desc, DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a source VGPR is also the destination.
+      // This can happen if WWM register allocation was efficient.
+      // Note: this assumes WWM execution.
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+              .addReg(ExecSrcReg)
+              .setOperandDead(3); // Dead scc
+        }
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!DstIsActive) {
+        // Set exec mask to active lanes
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      // Restore WWM
+      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+    }
    MI.eraseFromParent();
    break;
  }
@@ -5647,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg(
    MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
    const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
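+  // A physical super-register (e.g. after register allocation) already has
+  // named sub-registers, so return the matching physical sub-register
+  // directly instead of copying into a new virtual register.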
+  if (!SuperReg.getReg().isVirtual())
+    return RI.getSubReg(SuperReg.getReg(), SubIdx);
+
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);