Skip to content

Commit 13877db

Browse files
committed
[AMDGPU] Fix shortfalls in WQM marking
When tracking defined lanes through phi nodes in the live range graph each branch of the phi must be handled independently. Also rewrite the marking algorithm to reduce unnecessary operations. Previously a shared set of defined lanes was used which caused marking to stop prematurely. This was observable in existing lit tests, but test patterns did not cover this detail. Reviewed By: piotr Differential Revision: https://reviews.llvm.org/D98614
1 parent 07232f4 commit 13877db

File tree

3 files changed

+111
-28
lines changed

3 files changed

+111
-28
lines changed

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 68 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -318,38 +318,63 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
318318
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319319

320320
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321-
if (!UseLRQ.valueIn())
321+
const VNInfo *Value = UseLRQ.valueIn();
322+
if (!Value)
322323
return;
323324

324325
// Note: this code assumes that lane masks on AMDGPU completely
325326
// cover registers.
327+
const LaneBitmask UseLanes =
328+
SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329+
: (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330+
: LaneBitmask::getNone());
331+
332+
// Perform a depth-first iteration of the LiveRange graph marking defs.
333+
// Stop processing of a given branch when all use lanes have been defined.
334+
// The first definition stops processing for a physical register.
335+
struct PhiEntry {
336+
const VNInfo *Phi;
337+
unsigned PredIdx;
338+
unsigned VisitIdx;
339+
LaneBitmask DefinedLanes;
340+
341+
PhiEntry(const VNInfo *Phi, unsigned PredIdx, unsigned VisitIdx,
342+
LaneBitmask DefinedLanes)
343+
: Phi(Phi), PredIdx(PredIdx), VisitIdx(VisitIdx),
344+
DefinedLanes(DefinedLanes) {}
345+
};
346+
SmallSetVector<const VNInfo *, 4> Visited;
347+
SmallVector<PhiEntry, 2> PhiStack;
326348
LaneBitmask DefinedLanes;
327-
LaneBitmask UseLanes;
328-
if (SubReg) {
329-
UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
330-
} else if (Reg.isVirtual()) {
331-
UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
332-
}
333-
334-
SmallPtrSet<const VNInfo *, 4> Visited;
335-
SmallVector<const VNInfo *, 4> ToProcess;
336-
ToProcess.push_back(UseLRQ.valueIn());
349+
unsigned NextPredIdx; // Only used for processing phi nodes
337350
do {
338-
const VNInfo *Value = ToProcess.pop_back_val();
339-
Visited.insert(Value);
351+
const VNInfo *NextValue = nullptr;
352+
353+
if (!Visited.count(Value)) {
354+
Visited.insert(Value);
355+
// On first visit to a phi then start processing first predecessor
356+
NextPredIdx = 0;
357+
}
340358

341359
if (Value->isPHIDef()) {
342-
// Need to mark all defs used in the PHI node
360+
// Each predecessor node in the phi must be processed as a subgraph
343361
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
344362
assert(MBB && "Phi-def has no defining MBB");
345-
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
346-
PE = MBB->pred_end();
347-
PI != PE; ++PI) {
363+
364+
// Find next predecessor to process
365+
unsigned Idx = NextPredIdx;
366+
auto PI = MBB->pred_begin() + Idx;
367+
auto PE = MBB->pred_end();
368+
for (; PI != PE && !NextValue; ++PI, ++Idx) {
348369
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
349370
if (!Visited.count(VN))
350-
ToProcess.push_back(VN);
371+
NextValue = VN;
351372
}
352373
}
374+
375+
// If there are more predecessors to process; add phi to stack
376+
if (PI != PE)
377+
PhiStack.emplace_back(Value, Idx, Visited.size(), DefinedLanes);
353378
} else {
354379
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
355380
assert(MI && "Def has no defining instruction");
@@ -370,17 +395,20 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
370395
// Record if this instruction defined any of use
371396
HasDef |= Overlap.any();
372397

373-
// Check if all lanes of use have been defined
398+
// Mark any lanes defined
374399
DefinedLanes |= OpLanes;
375-
if ((DefinedLanes & UseLanes) != UseLanes) {
376-
// Definition not complete; need to process input value
377-
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
378-
if (const VNInfo *VN = LRQ.valueIn()) {
379-
if (!Visited.count(VN))
380-
ToProcess.push_back(VN);
381-
}
400+
}
401+
402+
// Check if all lanes of use have been defined
403+
if ((DefinedLanes & UseLanes) != UseLanes) {
404+
// Definition not complete; need to process input value
405+
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
406+
if (const VNInfo *VN = LRQ.valueIn()) {
407+
if (!Visited.count(VN))
408+
NextValue = VN;
382409
}
383410
}
411+
384412
// Only mark the instruction if it defines some part of the use
385413
if (HasDef)
386414
markInstruction(*MI, Flag, Worklist);
@@ -389,9 +417,21 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
389417
markInstruction(*MI, Flag, Worklist);
390418
}
391419
}
392-
} while (!ToProcess.empty());
393420

394-
assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
421+
if (!NextValue && !PhiStack.empty()) {
422+
// Reach end of chain; revert to processing last phi
423+
PhiEntry &Entry = PhiStack.back();
424+
NextValue = Entry.Phi;
425+
NextPredIdx = Entry.PredIdx;
426+
DefinedLanes = Entry.DefinedLanes;
427+
// Rewind visited set to correct state
428+
while (Visited.size() > Entry.VisitIdx)
429+
Visited.pop_back();
430+
PhiStack.pop_back();
431+
}
432+
433+
Value = NextValue;
434+
} while (Value);
395435
}
396436

397437
void SIWholeQuadMode::markOperand(const MachineInstr &MI,

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,10 @@ main_body:
859859
; CHECK-NEXT: ; %entry
860860
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
861861
; CHECK: s_wqm_b64 exec, exec
862+
; CHECK: v_mov
863+
; CHECK: v_mov
864+
; CHECK: v_mov
865+
; CHECK: v_mov
862866
; CHECK: s_and_b64 exec, exec, [[LIVE]]
863867
; CHECK: image_store
864868
; CHECK: s_wqm_b64 exec, exec

llvm/test/CodeGen/AMDGPU/wqm.mir

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,42 @@ body: |
259259
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
260260
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
261261
...
262+
263+
---
264+
# Check that WQM marking occurs correctly through phi nodes in live range graph.
265+
# If not then initial V_MOV will not be in WQM.
266+
#
267+
#CHECK-LABEL: name: test_wqm_lr_phi
268+
#CHECK: COPY $exec
269+
#CHECK-NEXT: S_WQM
270+
#CHECK-NEXT: V_MOV_B32_e32 -10
271+
#CHECK-NEXT: V_MOV_B32_e32 0
272+
name: test_wqm_lr_phi
273+
tracksRegLiveness: true
274+
body: |
275+
bb.0:
276+
undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
277+
%0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
278+
%1:sreg_64 = S_GETPC_B64
279+
%2:sgpr_256 = S_LOAD_DWORDX8_IMM %1:sreg_64, 32, 0, 0
280+
281+
bb.1:
282+
$vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
283+
S_CBRANCH_VCCNZ %bb.3, implicit $vcc
284+
S_BRANCH %bb.2
285+
286+
bb.2:
287+
%0.sub0:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
288+
S_BRANCH %bb.3
289+
290+
bb.3:
291+
%0.sub1:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
292+
S_BRANCH %bb.4
293+
294+
bb.4:
295+
%3:sgpr_128 = IMPLICIT_DEF
296+
%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %2:sgpr_256, %3:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource")
297+
$vgpr0 = COPY %4.sub0:vreg_128
298+
$vgpr1 = COPY %4.sub1:vreg_128
299+
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
300+
...

0 commit comments

Comments
 (0)