Skip to content

Commit a34a024

Browse files
authored
[AMDGPU][SIInsertWaitCnts] skip meta instructions early (#145720)
When iterating over a block, meta instructions have no effect on wait counts, but encountering one drops the reference to earlier waitcnt instructions before those waitcnt instructions have been processed. This results in spurious waitcnt instructions, which do not affect correctness but are not required in the resulting program. Skipping meta instructions as soon as they are seen cleans this up.
1 parent 13e6ea7 commit a34a024

10 files changed

+26
-79
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,8 +1786,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17861786
bool FlushVmCnt) {
17871787
setForceEmitWaitcnt();
17881788

1789-
if (MI.isMetaInstruction())
1790-
return false;
1789+
assert(!MI.isMetaInstruction());
17911790

17921791
AMDGPU::Waitcnt Wait;
17931792

@@ -2474,6 +2473,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24742473
E = Block.instr_end();
24752474
Iter != E;) {
24762475
MachineInstr &Inst = *Iter;
2476+
if (Inst.isMetaInstruction()) {
2477+
++Iter;
2478+
continue;
2479+
}
24772480

24782481
// Track pre-existing waitcnts that were added in earlier iterations or by
24792482
// the memory legalizer.

llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2250,7 +2250,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
22502250
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
22512251
; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
22522252
; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
2253-
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
22542253
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
22552254
;
22562255
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
@@ -2317,7 +2316,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
23172316
; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
23182317
; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
23192318
; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
2320-
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
23212319
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
23222320
;
23232321
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
731731
; GFX9-NEXT: s_waitcnt vmcnt(0)
732732
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
733733
; GFX9-NEXT: .LBB3_3: ; %exit
734-
; GFX9-NEXT: s_waitcnt vmcnt(0)
735734
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
736735
; GFX9-NEXT: s_movk_i32 s4, 0x8000
737736
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -973,7 +972,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
973972
; GFX9-NEXT: s_waitcnt vmcnt(0)
974973
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
975974
; GFX9-NEXT: .LBB4_3: ; %exit
976-
; GFX9-NEXT: s_waitcnt vmcnt(0)
977975
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
978976
; GFX9-NEXT: s_movk_i32 s4, 0x8000
979977
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -1217,7 +1215,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
12171215
; GFX9-NEXT: .LBB5_3: ; %exit
12181216
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
12191217
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1220-
; GFX9-NEXT: s_waitcnt vmcnt(0)
12211218
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
12221219
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
12231220
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1595,7 +1592,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
15951592
; GFX9-NEXT: s_movk_i32 s34, 0x3800
15961593
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
15971594
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1598-
; GFX9-NEXT: s_waitcnt vmcnt(0)
15991595
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
16001596
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
16011597
; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
@@ -1933,7 +1929,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
19331929
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
19341930
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
19351931
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
1936-
; GFX9-NEXT: s_waitcnt vmcnt(0)
19371932
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
19381933
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
19391934
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/extract-subvector.ll

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
127127
; GCN-NEXT: s_mov_b32 s11, 0xf000
128128
; GCN-NEXT: s_mov_b32 s8, s10
129129
; GCN-NEXT: s_mov_b32 s9, s10
130-
; GCN-NEXT: s_waitcnt vmcnt(0)
131130
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
132131
; GCN-NEXT: s_waitcnt vmcnt(0)
133132
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -138,7 +137,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
138137
; GCN-NEXT: s_waitcnt vmcnt(0)
139138
; GCN-NEXT: .LBB1_4: ; %exit
140139
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
141-
; GCN-NEXT: s_waitcnt vmcnt(0)
142140
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
143141
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
144142
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
@@ -197,7 +195,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
197195
; GCN-NEXT: s_mov_b32 s11, 0xf000
198196
; GCN-NEXT: s_mov_b32 s8, s10
199197
; GCN-NEXT: s_mov_b32 s9, s10
200-
; GCN-NEXT: s_waitcnt vmcnt(0)
201198
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
202199
; GCN-NEXT: s_waitcnt vmcnt(0)
203200
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -208,7 +205,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
208205
; GCN-NEXT: s_waitcnt vmcnt(0)
209206
; GCN-NEXT: .LBB2_4: ; %exit
210207
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
211-
; GCN-NEXT: s_waitcnt vmcnt(0)
212208
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
213209
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
214210
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
@@ -305,7 +301,6 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
305301
; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
306302
; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
307303
; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
308-
; GCN-NEXT: s_waitcnt vmcnt(0)
309304
; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
310305
; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
311306
; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
@@ -376,7 +371,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
376371
; GCN-NEXT: s_mov_b32 s11, 0xf000
377372
; GCN-NEXT: s_mov_b32 s8, s10
378373
; GCN-NEXT: s_mov_b32 s9, s10
379-
; GCN-NEXT: s_waitcnt vmcnt(0)
380374
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
381375
; GCN-NEXT: s_waitcnt vmcnt(0)
382376
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -387,7 +381,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
387381
; GCN-NEXT: s_waitcnt vmcnt(0)
388382
; GCN-NEXT: .LBB4_4: ; %exit
389383
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
390-
; GCN-NEXT: s_waitcnt vmcnt(0)
391384
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
392385
; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
393386
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
@@ -446,7 +439,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
446439
; GCN-NEXT: s_mov_b32 s11, 0xf000
447440
; GCN-NEXT: s_mov_b32 s8, s10
448441
; GCN-NEXT: s_mov_b32 s9, s10
449-
; GCN-NEXT: s_waitcnt vmcnt(0)
450442
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
451443
; GCN-NEXT: s_waitcnt vmcnt(0)
452444
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -457,7 +449,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
457449
; GCN-NEXT: s_waitcnt vmcnt(0)
458450
; GCN-NEXT: .LBB5_4: ; %exit
459451
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
460-
; GCN-NEXT: s_waitcnt vmcnt(0)
461452
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
462453
; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
463454
; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
@@ -554,7 +545,6 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
554545
; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
555546
; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
556547
; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
557-
; GCN-NEXT: s_waitcnt vmcnt(0)
558548
; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
559549
; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
560550
; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]

llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6171,13 +6171,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
61716171
; NOOPT-NEXT: v_mov_b32_e32 v11, v14
61726172
; NOOPT-NEXT: v_mov_b32_e32 v12, v13
61736173
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
6174-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
6174+
; NOOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
61756175
; NOOPT-NEXT: ; implicit-def: $sgpr1
61766176
; NOOPT-NEXT: ; implicit-def: $sgpr1
61776177
; NOOPT-NEXT: ; implicit-def: $sgpr1
61786178
; NOOPT-NEXT: ; implicit-def: $sgpr1
61796179
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
6180-
; NOOPT-NEXT: s_waitcnt expcnt(0)
61816180
; NOOPT-NEXT: v_mov_b32_e32 v9, v4
61826181
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
61836182
; NOOPT-NEXT: v_mov_b32_e32 v11, v2
@@ -7290,7 +7289,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
72907289
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
72917290
; NOOPT-NEXT: s_waitcnt vmcnt(0)
72927291
; NOOPT-NEXT: ; implicit-def: $sgpr0
7293-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
72947292
; NOOPT-NEXT: ;;#ASMSTART
72957293
; NOOPT-NEXT: ; reg use v[0:3]
72967294
; NOOPT-NEXT: ;;#ASMEND
@@ -7313,7 +7311,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
73137311
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
73147312
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73157313
; NOOPT-NEXT: ; implicit-def: $sgpr0
7316-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73177314
; NOOPT-NEXT: ;;#ASMSTART
73187315
; NOOPT-NEXT: ; reg use v[0:3]
73197316
; NOOPT-NEXT: ;;#ASMEND
@@ -7534,7 +7531,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75347531
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75357532
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75367533
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7537-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75387534
; NOOPT-NEXT: ;;#ASMSTART
75397535
; NOOPT-NEXT: ; reg use v[0:3]
75407536
; NOOPT-NEXT: ;;#ASMEND
@@ -7558,7 +7554,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75587554
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75597555
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75607556
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7561-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75627557
; NOOPT-NEXT: ;;#ASMSTART
75637558
; NOOPT-NEXT: ; reg use v[0:3]
75647559
; NOOPT-NEXT: ;;#ASMEND

0 commit comments

Comments
 (0)