Skip to content

Commit 2a68364

Browse files
committed
[AMDGPU] gfx11 waitcnt support for VINTERP and LDSDIR instructions
Reviewed By: rampitec, #amdgpu

Differential Revision: https://reviews.llvm.org/D127781
1 parent 6bb4055 commit 2a68364

File tree

3 files changed

+66
-42
lines changed

3 files changed

+66
-42
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -87,29 +87,29 @@ struct RegisterEncoding {
8787
};
8888

8989
enum WaitEventType {
90-
VMEM_ACCESS, // vector-memory read & write
91-
VMEM_READ_ACCESS, // vector-memory read
92-
VMEM_WRITE_ACCESS,// vector-memory write
93-
LDS_ACCESS, // lds read & write
94-
GDS_ACCESS, // gds read & write
95-
SQ_MESSAGE, // send message
96-
SMEM_ACCESS, // scalar-memory read & write
97-
EXP_GPR_LOCK, // export holding on its data src
98-
GDS_GPR_LOCK, // GDS holding on its data and addr src
99-
EXP_POS_ACCESS, // write to export position
100-
EXP_PARAM_ACCESS, // write to export parameter
101-
VMW_GPR_LOCK, // vector-memory write holding on its data src
90+
VMEM_ACCESS, // vector-memory read & write
91+
VMEM_READ_ACCESS, // vector-memory read
92+
VMEM_WRITE_ACCESS, // vector-memory write
93+
LDS_ACCESS, // lds read & write
94+
GDS_ACCESS, // gds read & write
95+
SQ_MESSAGE, // send message
96+
SMEM_ACCESS, // scalar-memory read & write
97+
EXP_GPR_LOCK, // export holding on its data src
98+
GDS_GPR_LOCK, // GDS holding on its data and addr src
99+
EXP_POS_ACCESS, // write to export position
100+
EXP_PARAM_ACCESS, // write to export parameter
101+
VMW_GPR_LOCK, // vector-memory write holding on its data src
102+
EXP_LDS_ACCESS, // read by ldsdir counting as export
102103
NUM_WAIT_EVENTS,
103104
};
104105

105106
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
106-
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
107-
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
108-
(1 << SQ_MESSAGE),
109-
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
110-
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
111-
(1 << VMEM_WRITE_ACCESS)
112-
};
107+
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
108+
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
109+
(1 << SQ_MESSAGE),
110+
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
111+
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
112+
(1 << VMEM_WRITE_ACCESS)};
113113

114114
// The mapping is:
115115
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -596,6 +596,12 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
596596
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
597597
CurrScore);
598598
}
599+
} else if (TII->isLDSDIR(Inst)) {
600+
// LDSDIR instructions attach the score to the destination.
601+
setExpScore(
602+
&Inst, TII, TRI, MRI,
603+
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
604+
CurrScore);
599605
} else {
600606
if (TII->isEXP(Inst)) {
601607
// For export the destination registers are really temps that
@@ -1135,7 +1141,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11351141
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
11361142
ScoreBrackets.clearVgprVmemTypes(RegNo);
11371143
}
1138-
if (Op.isDef()) {
1144+
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
11391145
ScoreBrackets.determineWait(
11401146
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
11411147
}
@@ -1192,6 +1198,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11921198
ScoreBrackets.applyWaitcnt(Wait);
11931199
}
11941200

1201+
// ExpCnt can be merged into VINTERP.
1202+
if (Wait.ExpCnt != ~0u && SIInstrInfo::isVINTERP(MI)) {
1203+
MachineOperand *WaitExp = TII->getNamedOperand(MI, AMDGPU::OpName::waitexp);
1204+
if (Wait.ExpCnt < WaitExp->getImm()) {
1205+
WaitExp->setImm(Wait.ExpCnt);
1206+
Modified = true;
1207+
}
1208+
Wait.ExpCnt = ~0u;
1209+
1210+
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1211+
<< "Update Instr: " << MI);
1212+
}
1213+
11951214
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
11961215
// instruction was modified to handle the required wait.
11971216
if (Wait.hasWaitExceptVsCnt()) {
@@ -1350,6 +1369,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
13501369
// May need to wait for anything.
13511370
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
13521371
}
1372+
} else if (SIInstrInfo::isLDSDIR(Inst)) {
1373+
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1374+
} else if (TII->isVINTERP(Inst)) {
1375+
int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1376+
ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
13531377
} else if (SIInstrInfo::isEXP(Inst)) {
13541378
unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
13551379
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
99
; GCN-NEXT: lds_param_load v0, attr0.y
1010
; GCN-NEXT: lds_param_load v1, attr1.x
1111
; GCN-NEXT: v_mov_b32_e32 v4, s1
12-
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
13-
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
12+
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
13+
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
1414
; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
1515
; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
1616
; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
@@ -36,10 +36,10 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
3636
; GCN-NEXT: lds_param_load v2, attr2.x
3737
; GCN-NEXT: lds_param_load v3, attr3.x
3838
; GCN-NEXT: v_mov_b32_e32 v5, s1
39-
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
40-
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
41-
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
42-
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
39+
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
40+
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
41+
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
42+
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
4343
; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
4444
; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
4545
; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
@@ -73,10 +73,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
7373
; GCN-NEXT: lds_param_load v4, attr2.x
7474
; GCN-NEXT: lds_param_load v5, attr3.x
7575
; GCN-NEXT: s_waitcnt vmcnt(0)
76-
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
77-
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
78-
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
79-
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
76+
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
77+
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
78+
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
79+
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
8080
; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
8181
; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
8282
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
@@ -111,7 +111,7 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
111111
; GCN-NEXT: v_mov_b32_e32 v1, s0
112112
; GCN-NEXT: lds_param_load v0, attr0.x
113113
; GCN-NEXT: v_mov_b32_e32 v2, s1
114-
; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
114+
; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0
115115
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
116116
; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
117117
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
99
; GCN-NEXT: lds_param_load v0, attr0.y
1010
; GCN-NEXT: lds_param_load v1, attr1.x
1111
; GCN-NEXT: v_mov_b32_e32 v4, s1
12-
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
13-
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
12+
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
13+
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
1414
; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
1515
; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
1616
; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
@@ -36,10 +36,10 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
3636
; GCN-NEXT: lds_param_load v2, attr2.x
3737
; GCN-NEXT: lds_param_load v3, attr3.x
3838
; GCN-NEXT: v_mov_b32_e32 v5, s1
39-
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
40-
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
41-
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
42-
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
39+
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
40+
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
41+
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
42+
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
4343
; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
4444
; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
4545
; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
@@ -73,10 +73,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
7373
; GCN-NEXT: lds_param_load v4, attr2.x
7474
; GCN-NEXT: lds_param_load v5, attr3.x
7575
; GCN-NEXT: s_waitcnt vmcnt(0)
76-
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
77-
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
78-
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
79-
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
76+
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
77+
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
78+
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
79+
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
8080
; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
8181
; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
8282
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
@@ -111,7 +111,7 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
111111
; GCN-NEXT: v_mov_b32_e32 v1, s0
112112
; GCN-NEXT: lds_param_load v0, attr0.x
113113
; GCN-NEXT: v_mov_b32_e32 v2, s1
114-
; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
114+
; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0
115115
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
116116
; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
117117
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7

0 commit comments

Comments
 (0)