[AMDGPU] V_SET_INACTIVE optimizations #98864
Conversation
Currently includes #98858 as it touches many of the same tests.

@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Carl Ritson (perlfu)

Changes: Optimize V_SET_INACTIVE by allowing it to run in WWM. Additionally, avoid introducing exec manipulation and V_MOVs where a source of V_SET_INACTIVE is the destination.

Patch is 245.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98864.diff

17 Files Affected:
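For orientation, a minimal standalone sketch of the per-lane behaviour being optimized follows. V_SET_INACTIVE writes the active source to lanes enabled in the live mask and the inactive source to all other lanes; the old expansion realized this with v_mov / s_not exec / v_mov / s_not exec, while in WWM the same result is a single per-lane select (V_CNDMASK) keyed on a saved copy of the live mask. The wave model, lane count, and names below are illustrative assumptions, not code from the patch.

#include <array>
#include <cstdint>
#include <cstdio>

// Hypothetical 64-lane wave model; purely illustrative, not LLVM or ISA code.
constexpr int WaveSize = 64;
using VGPR = std::array<uint32_t, WaveSize>;

// Per-lane semantics of V_SET_INACTIVE dst, active_src, inactive_src:
// lanes whose bit is set in LiveMask take the active value, all other lanes
// take the inactive value. In WWM this is exactly one select per lane, which
// is why a single V_CNDMASK keyed on the saved live mask can replace the
// exec-flip/v_mov sequence when operand constraints allow.
VGPR setInactive(const VGPR &Active, const VGPR &Inactive, uint64_t LiveMask) {
  VGPR Dst{};
  for (int Lane = 0; Lane < WaveSize; ++Lane)
    Dst[Lane] = ((LiveMask >> Lane) & 1) ? Active[Lane] : Inactive[Lane];
  return Dst;
}

int main() {
  VGPR Active{}, Inactive{};
  Active.fill(7);
  Inactive.fill(0); // e.g. the identity value for a wave-wide reduction
  uint64_t LiveMask = 0x00000000FFFFFFFFull; // lower 32 lanes active
  VGPR R = setInactive(Active, Inactive, LiveMask);
  std::printf("lane0=%u lane63=%u\n", R[0], R[63]); // prints 7 and 0
  return 0;
}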
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cc1b9ac0c9ecd..d551a7887e706 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+ // Find implicit exec src if this is running in WWM.
+ Register ExecSrcReg = 0;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fallback to V_MOV base lowering in all but the common cases.
+ bool InWWM = !!ExecSrcReg;
+ bool UseVCndMask = false;
+ if (InWWM) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses = 1; // Starts at one for ExecRegSrc
+ int LiteralConstants = 0;
+ ConstantBusUses +=
+ usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+ ConstantBusUses +=
+ usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+ LiteralConstants +=
+ ActiveSrc.isImm() &&
+ !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+ ? 1
+ : 0;
+ LiteralConstants +=
+ InactiveSrc.isImm() &&
+ !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+ ? 1
+ : 0;
+ UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+ LiteralConstants <= LiteralLimit &&
+ (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // WWM B64; decompose to two B32 operations.
+ // Test above ensures that both sources are registers.
+ // Note: this is done to avoid falling back to V_MOV multiple times
+ // and introducing exec manipulation for each VGPR separately.
+ assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+ Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+ Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+ Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+ Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+ MachineInstr *Tmp;
+ Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+ RI.getSubReg(DstReg, AMDGPU::sub0))
+ .addReg(InactiveLo)
+ .addReg(ActiveLo)
+ .addReg(ExecSrcReg, RegState::Implicit)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ expandPostRAPseudo(*Tmp);
+ Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+ RI.getSubReg(DstReg, AMDGPU::sub1))
+ .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+ .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+ .addReg(ExecSrcReg, RegState::Implicit)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ expandPostRAPseudo(*Tmp);
+ } else if (UseVCndMask) {
+ // WWM B32; use V_CNDMASK.
+ MachineInstr *VCndMask =
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .add(InactiveSrc)
+ .addImm(0)
+ .add(ActiveSrc)
+ .addReg(ExecSrcReg);
+ // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+ for (auto &Op : MI.implicit_operands()) {
+ if (!Op.isDef())
+ continue;
+ VCndMask->addOperand(Op);
+ }
+ } else {
+ // Fallback V_MOV case.
+ // Avoid unnecessary work if a src is the destination.
+ // This can happen if WWM register allocation was efficient.
+ bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+ bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+ if (!SkipActive) {
+ if (InWWM) {
+ // Cancel WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+ }
+ // Copy active lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+ .add(ActiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ if (!SkipInactive) {
+ // Set exec mask to inactive lanes
+ MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+ .addReg(InWWM ? ExecSrcReg : ExecReg);
+ ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ // Copy inactive lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ if (!InWWM) {
+ // Restore original exec mask
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+ }
+ }
+ if (InWWM) {
+ // Restore WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+ }
+ }
MI.eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 19e761ef45b25..e01c045e7ef3d 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
void printInfo();
@@ -225,6 +226,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerInitExec(MachineInstr &MI);
MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
+ void harmonizeTransitions();
+
public:
static char ID;
@@ -477,7 +480,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
- SmallVector<MachineInstr *, 4> SetInactiveInstrs;
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
bool HasImplicitDerivatives =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -554,6 +556,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+ // Disable strict states here while marking, relax it later.
III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
@@ -564,6 +567,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
}
SetInactiveInstrs.push_back(&MI);
+ GlobalFlags |= StateStrictWWM;
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -1037,6 +1042,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
SmallVector<MachineInstr *, 4> SplitPoints;
+ Register ActiveLanesReg = 0;
char State = BI.InitialState;
for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1053,6 +1059,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
+ case AMDGPU::ENTER_STRICT_WWM:
+ ActiveLanesReg = MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::EXIT_STRICT_WWM:
+ ActiveLanesReg = 0;
+ break;
+ case AMDGPU::V_SET_INACTIVE_B32:
+ case AMDGPU::V_SET_INACTIVE_B64:
+ if (ActiveLanesReg) {
+ MI.addOperand(*MBB.getParent(),
+ MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ } else
+ assert(State == StateExact || State == StateWQM);
+ break;
default:
break;
}
@@ -1617,6 +1637,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
return InsertPt;
}
+void SIWholeQuadMode::harmonizeTransitions() {
+ // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+ for (MachineInstr *MI : SetInactiveInstrs) {
+ if (MI->getOpcode() == AMDGPU::COPY)
+ continue;
+
+ Instructions[MI].Disabled &= ~StateStrictWWM;
+
+ auto MBB = MI->getParent();
+ auto It = MI->getIterator();
+ if (It == MBB->end())
+ continue;
+
+ bool AddWWM = false;
+ auto NextMI = std::next(It);
+ if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+ // Groups of SET_INACTIVE are more efficient in WWM.
+ AddWWM = true;
+ } else {
+ // Back propagate WWM needs of next instruction.
+ auto III = Instructions.find(&*NextMI);
+ AddWWM =
+ (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+ }
+
+ if (!AddWWM)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+ Instructions[MI].Needs |= StateStrictWWM;
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1629,6 +1683,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
+ SetInactiveInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1701,6 +1756,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS->InsertMachineInstrInMaps(*MI);
lowerKillInstrs(true);
} else {
+ harmonizeTransitions();
for (auto BII : Blocks)
processBlock(*BII.first, BII.first == &Entry);
// Lowering blocks causes block splitting so perform as a second pass.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..8fb4f2cd79a70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -96,15 +96,14 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -117,17 +116,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s7, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -140,15 +137,14 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -160,15 +156,14 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -181,17 +176,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -205,17 +198,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1.0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1.0
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -228,15 +219,14 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -249,17 +239,15 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x10001
-; GCN-NEXT: s_mov...
[truncated]
Force-pushed from 5736b08 to e9c1dd1.
Force-pushed from e9c1dd1 to b290ced.
int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
while (Index >= 0) {
  MI->removeOperand(Index);
  Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
Just for my education: Why do we have more than 1?
Instructions can have both implicit-def $scc and implicit $scc. I wouldn't expect more than 1 of each, although very occasionally odd (broken?) codegen can yield multiple implicit uses of the same phys reg. If you search the tests you'll find some examples of implicit $exec, implicit $exec.
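As an aside, a self-contained illustration of why the removal in the quoted hunk is a loop rather than a single erase: an operand list can carry both a def and one or more uses of the same register, so the code keeps searching until no reference remains. The Operand struct and helper below are illustrative stand-ins under that assumption, not LLVM API.

#include <algorithm>
#include <string>
#include <vector>

// Toy stand-in for a machine operand: a register name plus a def/use flag.
struct Operand {
  std::string Reg;
  bool IsDef;
};

// Erase every operand referring to Reg, defs and uses alike, mirroring the
// find-remove-find-again pattern in the patch (which must iterate because an
// instruction can carry implicit-def $scc and implicit $scc at the same time).
void removeAllRefs(std::vector<Operand> &Ops, const std::string &Reg) {
  auto Match = [&](const Operand &O) { return O.Reg == Reg; };
  auto It = std::find_if(Ops.begin(), Ops.end(), Match);
  while (It != Ops.end()) {
    Ops.erase(It);
    It = std::find_if(Ops.begin(), Ops.end(), Match);
  }
}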
Just noting some places where the codegen could be improved in future.
; GCN-NEXT: s_mov_b64 exec, -1
; GCN-NEXT: s_mov_b64 exec, s[4:5]
This pair of instructions is redundant. It would be nice to remove them one day.
Agreed.
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: s_mov_b64 exec, -1
Even better would be to replace these four instructions with a single v_cndmask.
The GFX9 constant bus/literal limit prevents us merging all 4.
On GFX10+ they are all merged.
In this case we could use v_cndmask_b32 v4, v0, v4, s[0:1] for the last three, as the destination is one of the sources.
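For concreteness, a small standalone sketch of the in-place select this relies on, reusing the illustrative 64-lane model from the sketch near the top of the thread (names and lane count are assumptions, not ISA definitions). v_cndmask_b32 writes, per lane, one source when the mask bit is set and the other when it is clear, so when the destination register already holds one of the two values a single instruction performs the whole merge with no exec manipulation.

#include <array>
#include <cstdint>

constexpr int WaveSize = 64;
using VGPR = std::array<uint32_t, WaveSize>;

// In-place per-lane select: lanes whose mask bit is set take the new value,
// lanes whose bit is clear keep whatever the destination already holds.
// This is the effect of a v_cndmask whose destination doubles as one source.
void cndmaskInPlace(VGPR &Dst, const VGPR &SetBitSrc, uint64_t Mask) {
  for (int Lane = 0; Lane < WaveSize; ++Lane)
    if ((Mask >> Lane) & 1)
      Dst[Lane] = SetBitSrc[Lane];
}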
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
This pair of instructions could be removed.
LGTM overall though I am not particularly confident reviewing the SIWholeQuadMode parts.
if (UseVCndMask && VMov64) {
  // Dual V_CNDMASK_B32
  MachineOperand ActiveLo =
This feels like SIInstrInfo::buildExtractSubRegOrImm, but I guess that assumes virtual registers and here you need physical registers?
It would be nicer to extract this into some variant of buildExtractSubRegOrImm that handles physical registers
I've made buildExtractSubReg do something for physical registers and used it instead for this case.
Optimize V_SET_INACTIVE by always running it in WWM. This keeps WWM sections unbroken and allows V_SET_INACTIVE to be lowered to V_CNDMASK in most cases. Some cases still require the exec manipulation and V_MOVs used by the previous code. GFX9 sees a slight instruction count increase in edge cases due to its smaller constant bus. Additionally:
- Avoid introducing exec manipulation and V_MOVs where a source of V_SET_INACTIVE is the destination.
- Lower any V_SET_INACTIVE not touched by marking to a COPY.
- Add findImplicitExecSrc helper.
- Use the helper to ignore V_SET_INACTIVE instructions during WQM/WWM processing. This allows other passes to emit V_SET_INACTIVE for already-known WWM sections. This supports llvm#105822.
- Add a test for the above.
Force-pushed from 430bf82 to d1f19e7.
// These are generated by an earlier pass which has seperately ensured
// WWM and provided a mask of inactive lanes.
@perlfu what's the point of this? As far as I can tell it only happens with the hand written MIR test that you added (preloaded_set_inactive in wqm.mir).
This was added to facilitate V_SET_INACTIVE use in #105822.
Optimize V_SET_INACTIVE by allowing it to run in WWM, so WWM sections are not broken up just to set inactive lanes. In WWM, V_SET_INACTIVE can typically be lowered to V_CNDMASK; some cases still require the exec manipulation and V_MOVs used by the previous code. GFX9 sees a slight instruction count increase in edge cases due to its smaller constant bus.

Additionally, avoid introducing exec manipulation and V_MOVs where a source of V_SET_INACTIVE is the destination. This is a common pattern, as WWM register pre-allocation often assigns the same register.