Skip to content

Commit 4672bac

Browse files
committed
[AMDGPU] Introduce Strict WQM mode
* Add amdgcn_strict_wqm intrinsic. * Add a corresponding STRICT_WQM machine instruction. * The semantics are similar to amdgcn_strict_wwm, with the notable difference that not all threads will be forcibly enabled during the computation of the intrinsic's argument, but only the threads in quads that have at least one thread active. * The difference between amdgcn_wqm and amdgcn_strict_wqm is that in the strict mode an inactive lane will always be enabled irrespective of control flow decisions. Reviewed By: critson Differential Revision: https://reviews.llvm.org/D96258
1 parent 5d613e4 commit 4672bac

File tree

10 files changed

+579
-94
lines changed

10 files changed

+579
-94
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,10 @@ def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
16211621
[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
16221622
IntrConvergent, IntrWillReturn]
16231623
>;
1624+
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
1625+
[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
1626+
IntrConvergent, IntrWillReturn]
1627+
>;
16241628

16251629
// Given a value, copies it while setting all the inactive lanes to a given
16261630
// value. Note that OpenGL helper lanes are considered active, so if the

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2645,6 +2645,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
26452645
case Intrinsic::amdgcn_strict_wwm:
26462646
Opcode = AMDGPU::STRICT_WWM;
26472647
break;
2648+
case Intrinsic::amdgcn_strict_wqm:
2649+
Opcode = AMDGPU::STRICT_WQM;
2650+
break;
26482651
case Intrinsic::amdgcn_interp_p1_f16:
26492652
SelectInterpP1F16(N);
26502653
return;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
930930
case Intrinsic::amdgcn_strict_wwm:
931931
case Intrinsic::amdgcn_wwm:
932932
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
933+
case Intrinsic::amdgcn_strict_wqm:
934+
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
933935
case Intrinsic::amdgcn_writelane:
934936
return selectWritelane(I);
935937
case Intrinsic::amdgcn_div_scale:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3958,6 +3958,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39583958
case Intrinsic::amdgcn_mov_dpp:
39593959
case Intrinsic::amdgcn_strict_wwm:
39603960
case Intrinsic::amdgcn_wwm:
3961+
case Intrinsic::amdgcn_strict_wqm:
39613962
case Intrinsic::amdgcn_wqm:
39623963
case Intrinsic::amdgcn_softwqm:
39633964
case Intrinsic::amdgcn_set_inactive:

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
581581
continue;
582582
case AMDGPU::COPY:
583583
case AMDGPU::WQM:
584+
case AMDGPU::STRICT_WQM:
584585
case AMDGPU::SOFT_WQM:
585586
case AMDGPU::STRICT_WWM: {
586587
Register DstReg = MI.getOperand(0).getReg();

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,9 +1949,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
19491949
: AMDGPU::S_OR_SAVEEXEC_B64));
19501950
break;
19511951
}
1952-
case AMDGPU::EXIT_STRICT_WWM: {
1952+
case AMDGPU::ENTER_STRICT_WQM: {
19531953
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1954-
// Whole Wave Mode is exited.
1954+
// STRICT_WQM is entered.
1955+
const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1956+
const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
1957+
const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1958+
BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
1959+
BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
1960+
1961+
MI.eraseFromParent();
1962+
break;
1963+
}
1964+
case AMDGPU::EXIT_STRICT_WWM:
1965+
case AMDGPU::EXIT_STRICT_WQM: {
1966+
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1967+
// WWM/STRICT_WQM is exited.
19551968
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
19561969
break;
19571970
}
@@ -4407,6 +4420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
44074420
case AMDGPU::WQM: return AMDGPU::WQM;
44084421
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
44094422
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
4423+
case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
44104424
case AMDGPU::S_MOV_B32: {
44114425
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
44124426
return MI.getOperand(1).isReg() ||
@@ -6643,6 +6657,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
66436657
case AMDGPU::WQM:
66446658
case AMDGPU::SOFT_WQM:
66456659
case AMDGPU::STRICT_WWM:
6660+
case AMDGPU::STRICT_WQM:
66466661
case AMDGPU::REG_SEQUENCE:
66476662
case AMDGPU::PHI:
66486663
case AMDGPU::INSERT_SUBREG:
@@ -6800,7 +6815,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
68006815
case AMDGPU::INSERT_SUBREG:
68016816
case AMDGPU::WQM:
68026817
case AMDGPU::SOFT_WQM:
6803-
case AMDGPU::STRICT_WWM: {
6818+
case AMDGPU::STRICT_WWM:
6819+
case AMDGPU::STRICT_WQM: {
68046820
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
68056821
if (RI.hasAGPRs(SrcRC)) {
68066822
if (RI.hasAGPRs(NewDstRC))

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
125125
// accidentally clobber inactive channels of $vdst.
126126
let Constraints = "@earlyclobber $vdst" in {
127127
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
128+
def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
128129
}
129130

130131
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
@@ -143,6 +144,20 @@ def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
143144
let mayStore = 0;
144145
}
145146

147+
def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
148+
let Uses = [EXEC];
149+
let Defs = [EXEC, SCC];
150+
let hasSideEffects = 0;
151+
let mayLoad = 0;
152+
let mayStore = 0;
153+
}
154+
155+
def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
156+
let hasSideEffects = 0;
157+
let mayLoad = 0;
158+
let mayStore = 0;
159+
}
160+
146161
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
147162
// restoring it after we're done.
148163
let Defs = [SCC] in {

llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ class SIPreAllocateWWMRegs : public MachineFunctionPass {
3838
RegisterClassInfo RegClassInfo;
3939

4040
std::vector<unsigned> RegsToRewrite;
41+
#ifndef NDEBUG
42+
void printWWMInfo(const MachineInstr &MI);
43+
#endif
4144

4245
public:
4346
static char ID;
@@ -154,6 +157,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
154157
MRI->freezeReservedRegs(MF);
155158
}
156159

160+
#ifndef NDEBUG
161+
LLVM_DUMP_METHOD void
162+
SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
163+
164+
unsigned Opc = MI.getOpcode();
165+
166+
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
167+
dbgs() << "Entering ";
168+
} else {
169+
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
170+
dbgs() << "Exiting ";
171+
}
172+
173+
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
174+
dbgs() << "Strict WWM ";
175+
} else {
176+
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
177+
dbgs() << "Strict WQM ";
178+
}
179+
180+
dbgs() << "region: " << MI;
181+
}
182+
183+
#endif
184+
157185
bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
158186
LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
159187

@@ -185,21 +213,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
185213
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
186214
RegsAssigned |= processDef(MI.getOperand(0));
187215

188-
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) {
189-
LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
216+
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
217+
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
218+
LLVM_DEBUG(printWWMInfo(MI));
190219
InWWM = true;
191220
continue;
192221
}
193222

194-
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) {
195-
LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
223+
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
224+
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
225+
LLVM_DEBUG(printWWMInfo(MI));
196226
InWWM = false;
197227
}
198228

199229
if (!InWWM)
200230
continue;
201231

202-
LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
232+
LLVM_DEBUG(dbgs() << "Processing " << MI);
203233

204234
for (MachineOperand &DefOpnd : MI.defs()) {
205235
RegsAssigned |= processDef(DefOpnd);

0 commit comments

Comments
 (0)