Skip to content

Commit bd9635e

Browse files
committed
added cycle reduction for instructions issued between VALU->SGPR and SGPR->SALU
1 parent d273717 commit bd9635e

File tree

9 files changed

+137
-37
lines changed

9 files changed

+137
-37
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,11 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
340340
bool Changed = false;
341341
MachineInstr *LastDelayAlu = nullptr;
342342

343+
bool VALUSALUStall = false;
344+
MCRegUnit lastSgprWrite = 0;
345+
MCRegUnit longestWait = 0;
346+
unsigned deletedCyclesNum = 0;
347+
343348
// Iterate over the contents of bundles, but don't emit any instructions
344349
// inside a bundle.
345350
for (auto &MI : MBB.instrs()) {
@@ -371,15 +376,51 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
371376
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372377
auto It = State.find(Unit);
373378
if (It != State.end()) {
374-
if (!(SII->isSALU(MI.getOpcode())) ||
375-
!AMDGPU::isSGPR(Op.getReg(), TRI) ||
376-
It->second.VALUCycles == 0)
379+
if (SII->isSALU(MI.getOpcode()) &&
380+
AMDGPU::isSGPR(Op.getReg(), TRI) &&
381+
It->second.VALUCycles > 0) {
382+
deletedCyclesNum = It->second.VALUCycles;
383+
State.erase(Unit);
384+
VALUSALUStall = true;
385+
} else {
377386
Delay.merge(It->second);
378-
State.erase(Unit);
387+
State.erase(Unit);
388+
}
379389
}
380390
}
381391
}
382392
}
393+
unsigned maxCycles = 0;
394+
unsigned lastWrite = 0;
395+
if (Type != OTHER) {
396+
for (const auto &Op : MI.defs()) {
397+
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
398+
if (AMDGPU::isSGPR(Op.getReg(), TRI)) {
399+
maxCycles =
400+
(State.find(longestWait) == State.end())
401+
? std::max(deletedCyclesNum, (unsigned)0)
402+
: std::max(State[longestWait].VALUCycles,
403+
State[longestWait].SALUCycles);
404+
lastWrite =
405+
(State.find(lastSgprWrite) == State.end())
406+
? 0
407+
: std::max(State[lastSgprWrite].VALUCycles,
408+
State[lastSgprWrite].SALUCycles);
409+
if (maxCycles <= lastWrite)
410+
longestWait = lastSgprWrite;
411+
lastSgprWrite = Unit;
412+
}
413+
}
414+
}
415+
}
416+
417+
if (VALUSALUStall) {
418+
State.advance(VALU, maxCycles);
419+
VALUSALUStall = false;
420+
lastSgprWrite = 0;
421+
longestWait = 0;
422+
}
423+
383424
if (Emit && !MI.isBundledWithPred()) {
384425
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
385426
// just ignore them?

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2359,16 +2359,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
23592359
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
23602360
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
23612361
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
2362-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
23632362
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
2363+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
23642364
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
23652365
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2366-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
23672366
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
23682367
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
2368+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
23692369
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
23702370
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
2371-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
23722371
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
23732372
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
23742373
; GFX11-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2782,7 +2782,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
27822782
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
27832783
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
27842784
; GFX11-NEXT: s_addc_u32 s0, s0, s7
2785-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2785+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
27862786
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
27872787
; GFX11-NEXT: s_mul_i32 s7, s5, s0
27882788
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
28072807
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
28082808
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
28092809
; GFX11-NEXT: s_addc_u32 s0, s0, s5
2810-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2810+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
28112811
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
28122812
; GFX11-NEXT: s_mul_i32 s6, s10, s0
28132813
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0

llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,10 @@ define i32 @combine_add_zext_xor() {
3939
; GFX1100-NEXT: .LBB0_1: ; %bb9
4040
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
4141
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
42-
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4342
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
4443
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
44+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4545
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
46-
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
4746
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
4847
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
4948
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -119,11 +118,10 @@ define i32 @combine_sub_zext_xor() {
119118
; GFX1100-NEXT: .LBB1_1: ; %bb9
120119
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
121120
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
122-
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
123121
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
124122
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
123+
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
125124
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
126-
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
127125
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
128126
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
129127
; GFX1100-NEXT: .LBB1_2: ; %.a

llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15305,7 +15305,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
1530515305
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
1530615306
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1530715307
; GFX12-NEXT: s_wait_alu 0xfffe
15308-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1530915308
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
1531015309
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
1531115310
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15467,7 +15466,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
1546715466
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
1546815467
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1546915468
; GFX12-NEXT: s_wait_alu 0xfffe
15470-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1547115469
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
1547215470
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
1547315471
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15641,7 +15639,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
1564115639
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
1564215640
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1564315641
; GFX12-NEXT: s_wait_alu 0xfffe
15644-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1564515642
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
1564615643
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
1564715644
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15809,7 +15806,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
1580915806
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
1581015807
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1581115808
; GFX12-NEXT: s_wait_alu 0xfffe
15812-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1581315809
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
1581415810
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
1581515811
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15966,7 +15962,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
1596615962
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
1596715963
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1596815964
; GFX12-NEXT: s_wait_alu 0xfffe
15969-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1597015965
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
1597115966
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
1597215967
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16122,7 +16117,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
1612216117
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
1612316118
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1612416119
; GFX12-NEXT: s_wait_alu 0xfffe
16125-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1612616120
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
1612716121
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
1612816122
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16289,7 +16283,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
1628916283
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
1629016284
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1629116285
; GFX12-NEXT: s_wait_alu 0xfffe
16292-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1629316286
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
1629416287
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
1629516288
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16451,7 +16444,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
1645116444
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
1645216445
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1645316446
; GFX12-NEXT: s_wait_alu 0xfffe
16454-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1645516447
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
1645616448
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
1645716449
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3103,17 +3103,16 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
31033103
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
31043104
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
31053105
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3106-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3106+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
31073107
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
31083108
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
31093109
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
31103110
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
31113111
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3112-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31133112
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
31143113
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3114+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
31153115
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3116-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31173116
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
31183117
; GFX11-NEXT: s_setpc_b64 s[30:31]
31193118
%fpround = fptrunc double %a to half
@@ -3252,13 +3251,13 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
32523251
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
32533252
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
32543253
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3255-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
32563254
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
32573255
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3256+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
32583257
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
32593258
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3260-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
32613259
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3260+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
32623261
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
32633262
; GFX11-NEXT: s_setpc_b64 s[30:31]
32643263
%fneg.a = fneg double %a
@@ -3563,13 +3562,13 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
35633562
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
35643563
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
35653564
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3566-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
35673565
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
35683566
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
3567+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
35693568
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
35703569
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
3571-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
35723570
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
3571+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35733572
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
35743573
; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
35753574
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3716,15 +3715,14 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
37163715
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
37173716
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
37183717
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3719-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37203718
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
37213719
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3720+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37223721
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
37233722
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3724-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
37253723
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3724+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
37263725
; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
3727-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
37283726
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
37293727
; GFX11-NEXT: s_setpc_b64 s[30:31]
37303728
%fpround = fptrunc double %a to half

llvm/test/CodeGen/AMDGPU/idiv-licm.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -959,12 +959,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
959959
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
960960
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
961961
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
962-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
963962
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
964963
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
965964
; GFX11-NEXT: s_add_i32 s3, s3, 1
966965
; GFX11-NEXT: v_mov_b32_e32 v3, s5
967966
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
967+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
968968
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
969969
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
970970
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1094,17 +1094,16 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
10941094
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
10951095
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
10961096
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1097-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1097+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10981098
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
10991099
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
11001100
; GFX11-NEXT: s_add_i32 s3, s3, 1
11011101
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1102-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
11031102
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
11041103
; GFX11-NEXT: v_mov_b32_e32 v3, s5
1104+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
11051105
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
11061106
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
1107-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11081107
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
11091108
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
11101109
; GFX11-NEXT: s_cbranch_scc0 .LBB7_1

llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,5 +560,79 @@ body: |
560560
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
561561
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
562562
...
563+
564+
# Check if s_delay_alu is added
565+
---
566+
name: redundant_delay_alu_1
567+
body: |
568+
bb.0:
569+
; CHECK-LABEL: redundant_delay_alu_1:
570+
; CHECK: ; %bb.0:
571+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
572+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
573+
; CHECK-NEXT: s_or_b32 s0, s0, s1
574+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
575+
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
576+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
577+
$sgpr0= S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
578+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
579+
...
580+
581+
# Check if s_delay_alu is added
582+
---
583+
name: delay_alu
584+
body: |
585+
bb.0:
586+
; CHECK-LABEL: delay_alu:
587+
; CHECK: ; %bb.0:
588+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
589+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
590+
; CHECK-NEXT: s_or_b32 s0, s0, s1
591+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
592+
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
593+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
594+
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
595+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
596+
...
597+
598+
# Check if redundant delay_alu is removed
599+
---
600+
name: redundant_delay_alu_2
601+
body: |
602+
bb.0:
603+
; CHECK-LABEL: redundant_delay_alu_2:
604+
; CHECK: ; %bb.0:
605+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
606+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
607+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7
608+
; CHECK-NEXT: s_or_b32 s0, s0, s1
609+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
610+
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
611+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
612+
$sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec
613+
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
614+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
615+
...
616+
617+
# Check if the required delay_alu is preserved
618+
---
619+
name: perserved_delay
620+
body: |
621+
bb.0:
622+
; CHECK-LABEL: perserved_delay:
623+
; CHECK: ; %bb.0:
624+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
625+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
626+
; CHECK-NEXT: s_or_b32 s0, s0, s1
627+
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
628+
; CHECK-NEXT: s_or_b32 s2, s0, s0
629+
; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
630+
liveins : $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
631+
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
632+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
633+
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
634+
$sgpr2 = S_OR_B32 $sgpr0, $sgpr0, implicit-def $scc
635+
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
636+
...
563637
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
564638
# CHECK: {{.*}}

llvm/test/CodeGen/AMDGPU/load-constant-i1.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4904,12 +4904,11 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
49044904
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
49054905
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
49064906
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
4907-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
49084907
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
49094908
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
4909+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
49104910
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
49114911
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
4912-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
49134912
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
49144913
; GFX12-NEXT: s_and_b32 s2, s2, 1
49154914
; GFX12-NEXT: s_wait_alu 0xfffe

0 commit comments

Comments
 (0)