
Commit ab827bd

[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
CodeGenPrepare sinks comparisons close to their users if the target has only one register for conditions. AMDGPU has many SGPRs capable of holding vector conditions, so the backend has been changed to report that it has multiple condition registers. That way the IR LICM pass hoists an invariant comparison out of a loop, and CodeGenPrepare will not sink it back. With that done, a condition is computed in one block and used in another. The previous behavior was to save a workitem's condition in a VGPR using v_cndmask_b32 and then restore it with yet another v_cmp instruction applied to that v_cndmask's result. To eliminate this round trip, the source SGPR pair is now propagated in place of the v_cmp. A side effect is that we may consume fewer VGPRs at the cost of more SGPRs when multiple conditions need to be held live, which is a clear win in most cases.

Differential Revision: https://reviews.llvm.org/D26114

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288053 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 46cc792 commit ab827bd
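For illustration, a hedged before/after sketch of the round trip this change removes (register assignments are hypothetical, not taken from the commit):

    ; Before: the condition is serialized into a VGPR and re-derived at its use.
    v_cmp_lt_u32_e64 s[0:1], v2, v3       ; compare where the condition is defined
    v_cndmask_b32_e64 v1, 0, -1, s[0:1]   ; save the per-lane condition in a VGPR
    ; ...
    v_cmp_ne_u32_e32 vcc, 0, v1           ; restore it with yet another v_cmp
    s_and_saveexec_b64 s[2:3], vcc

    ; After: the hoisted comparison's SGPR pair is used directly.
    v_cmp_lt_u32_e64 s[0:1], v2, v3
    ; ...
    s_and_saveexec_b64 s[2:3], s[0:1]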

5 files changed: +146 −11 lines changed

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
@@ -440,6 +440,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
 
   // SI at least has hardware support for floating point exceptions, but no way
   // of using or handling them is implemented. They are also optional in OpenCL
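In ISA terms, the hook tells the middle end that each live condition can keep its own SGPR pair, so hoisted compares need not be re-materialized next to their users. A minimal hedged sketch (hypothetical registers):

    v_cmp_lt_u32_e64 s[0:1], v0, v1   ; first condition lives in one SGPR pair
    v_cmp_gt_u32_e64 s[2:3], v0, v2   ; a second condition can be live at the same time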

lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 75 additions & 3 deletions
@@ -80,6 +80,11 @@ class SILowerControlFlow : public MachineFunctionPass {
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+
 public:
   static char ID;
 
@@ -336,6 +341,62 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   LIS->handleMove(*NewMI);
 }
 
+// Returns the replacement operands for a logical operation: a single operand
+// for exec, or two operands if the source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                                          SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+  // A copy with an implicitly defined exec inserted earlier is an exception:
+  // it does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search for and combine pairs of equivalent instructions, like
+//   S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+//   S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is the exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
@@ -351,9 +412,9 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next;
+    MachineBasicBlock::iterator I, Next, Last;
 
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
@@ -386,9 +447,20 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         emitEndCf(MI);
         break;
 
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Clean up bit manipulations on the exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
       default:
-        break;
+        Last = I;
+        continue;
       }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }
 
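As a hedged illustration of the combineMasks() rewrite above, with hypothetical SGPR pairs standing in for the virtual registers:

    s_and_b64 s[2:3], exec, s[0:1]    ; inner mask operation
    s_and_b64 s[4:5], exec, s[2:3]    ; outer operation repeats the exec operand
    ; combineMasks() rewrites the outer instruction to read the unique source:
    s_and_b64 s[4:5], exec, s[0:1]
    ; the inner s_and_b64 is erased once s[2:3] has no remaining uses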

lib/Target/AMDGPU/SILowerI1Copies.cpp

Lines changed: 22 additions & 6 deletions
@@ -100,12 +100,12 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
       const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
       const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
 
+      DebugLoc DL = MI.getDebugLoc();
+      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
       if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
         I1Defs.push_back(Dst.getReg());
-        DebugLoc DL = MI.getDebugLoc();
 
-        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
         if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
           if (DefInst->getOperand(1).isImm()) {
             I1Defs.push_back(Dst.getReg());
@@ -129,10 +129,26 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
         MI.eraseFromParent();
       } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                  SrcRC == &AMDGPU::VReg_1RegClass) {
-        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
-          .addOperand(Dst)
-          .addOperand(Src)
-          .addImm(0);
+        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+            DefInst->getOperand(1).getImm() == 0 &&
+            DefInst->getOperand(2).getImm() != 0 &&
+            DefInst->getOperand(3).isReg() &&
+            TargetRegisterInfo::isVirtualRegister(
+                DefInst->getOperand(3).getReg()) &&
+            TRI->getCommonSubClass(
+                MRI.getRegClass(DefInst->getOperand(3).getReg()),
+                &AMDGPU::SGPR_64RegClass)) {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+            .addOperand(Dst)
+            .addReg(AMDGPU::EXEC)
+            .addOperand(DefInst->getOperand(3));
+        } else {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+            .addOperand(Dst)
+            .addOperand(Src)
+            .addImm(0);
+        }
         MI.eraseFromParent();
       }
     }
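A hedged sketch of the copy pattern the new branch recognizes (hypothetical registers):

    v_cndmask_b32_e64 v1, 0, -1, s[0:1]   ; i1 copy of an SGPR condition into a VReg_1
    ; Copying the VReg_1 back to a 64-bit condition previously emitted:
    v_cmp_ne_u32_e64 s[2:3], v1, 0
    ; With this patch it becomes a scalar op on the original condition:
    s_and_b64 s[2:3], exec, s[0:1]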

test/CodeGen/AMDGPU/branch-relaxation.ll

Lines changed: 2 additions & 2 deletions
@@ -493,9 +493,9 @@ ret:
 ; GCN: s_setpc_b64
 
 ; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
+; GCN: v_cmp_ne_u32_e32
 ; GCN-NEXT: ; implicit-def
-; GCN-NEXT: s_cbranch_scc0
+; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64
 
 ; GCN: s_endpgm

test/CodeGen/AMDGPU/hoist-cond.ll

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that the invariant compare is hoisted out of the loop.
+; At the same time, the condition shall not be serialized into a VGPR and deserialized later
+; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK-NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp5 = icmp ult i32 %tmp, %arg3
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+  %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+  br i1 %tmp5, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  %tmp10 = zext i32 %tmp7 to i64
+  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb1
+  %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %tmp15 = fadd float %tmp8, %tmp14
+  %tmp16 = add i32 %tmp7, -1
+  %tmp17 = icmp eq i32 %tmp16, 0
+  br i1 %tmp17, label %bb4, label %bb1
+
+bb4:                                              ; preds = %bb3
+  store float %tmp15, float addrspace(1)* %arg, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
