Skip to content

Commit 9da9d32

Browse files
authored
[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode (#144024)
Update the sext patterns for true16 mode, setting up proper VGPR_16 register use.
1 parent 8b8a369 commit 9da9d32

File tree

8 files changed

+521
-340
lines changed

8 files changed

+521
-340
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2623,6 +2623,8 @@ def : GCNPat<
26232623
(i32 (DivergentSextInreg<i1> i32:$src)),
26242624
(V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
26252625

2626+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
2627+
let True16Predicate = p in {
26262628
def : GCNPat <
26272629
(i16 (DivergentSextInreg<i1> i16:$src)),
26282630
(V_BFE_I32_e64 $src, (i32 0), (i32 1))
@@ -2632,6 +2634,23 @@ def : GCNPat <
26322634
(i16 (DivergentSextInreg<i8> i16:$src)),
26332635
(V_BFE_I32_e64 $src, (i32 0), (i32 8))
26342636
>;
2637+
}
2638+
2639+
let True16Predicate = UseRealTrue16Insts in {
2640+
def : GCNPat <
2641+
(i16 (DivergentSextInreg<i1> i16:$src)),
2642+
(V_BFE_I32_e64
2643+
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
2644+
(i32 0), (i32 1))
2645+
>;
2646+
2647+
def : GCNPat <
2648+
(i16 (DivergentSextInreg<i8> i16:$src)),
2649+
(V_BFE_I32_e64
2650+
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
2651+
(i32 0), (i32 8))
2652+
>;
2653+
}
26352654

26362655
def : GCNPat<
26372656
(i32 (DivergentSextInreg<i8> i32:$src)),

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,11 +319,21 @@ let SchedRW = [Write64Bit] in {
319319
} // End SchedRW = [Write64Bit]
320320
} // End isReMaterializable = 1
321321

322+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
323+
let True16Predicate = p in
322324
def : GCNPat<
323325
(i32 (DivergentUnaryFrag<sext> i16:$src)),
324326
(i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10)))
325327
>;
326328

329+
let True16Predicate = UseRealTrue16Insts in
330+
def : GCNPat<
331+
(i32 (DivergentUnaryFrag<sext> i16:$src)),
332+
(i32 (V_BFE_I32_e64
333+
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
334+
(i32 0), (i32 0x10)))
335+
>;
336+
327337
let isReMaterializable = 1 in {
328338
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
329339
defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -423,6 +433,8 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32
423433

424434
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
425435

436+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
437+
let True16Predicate = p in
426438
def : GCNPat<
427439
(i64 (DivergentUnaryFrag<sext> i16:$src)),
428440
(REG_SEQUENCE VReg_64,
@@ -432,6 +444,18 @@ def : GCNPat<
432444
), VGPR_32)), sub1)
433445
>;
434446

447+
let True16Predicate = UseRealTrue16Insts in
448+
def : GCNPat<
449+
(i64 (DivergentUnaryFrag<sext> i16:$src)),
450+
(REG_SEQUENCE VReg_64,
451+
(i32 (V_BFE_I32_e64
452+
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
453+
(S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
454+
(i32 (COPY_TO_REGCLASS
455+
(V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
456+
), VGPR_32)), sub1)
457+
>;
458+
435459
let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
436460
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
437461
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;

llvm/test/CodeGen/AMDGPU/idot4s.ll

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,35 +1165,32 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
11651165
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
11661166
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
11671167
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
1168-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
1169-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
1168+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
1169+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
11701170
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
11711171
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
1172-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
1172+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
11731173
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
1174-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
1175-
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v6.h, 8, v2.l
1176-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
1177-
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v8.h, 8, v1.l
1178-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
1179-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
1180-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h
1181-
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h
1174+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
1175+
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v4.h, 8, v1.l
1176+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1177+
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v2.l
1178+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
1179+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
1180+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
1181+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
11821182
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v1.h, 8, v1.h
1183-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
1184-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
1185-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
1186-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v7, 0, 8
1187-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1188-
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v6, v6, v8
1189-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
1190-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1183+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
1184+
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h
1185+
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v4, v4, v7
1186+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
1187+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
11911188
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
11921189
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
1193-
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v6.l, v0.l
1190+
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v0.l
11941191
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1195-
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v2, v1
1196-
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v6.h
1192+
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1193+
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.h
11971194
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11981195
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
11991196
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
@@ -3435,35 +3432,31 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
34353432
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
34363433
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
34373434
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
3438-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
3439-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
3435+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2
3436+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8
34403437
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
3441-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v3
3442-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
3443-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v2
3444-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v0, 0, 8
3445-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
34463438
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
3447-
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
3439+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
3440+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
3441+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
34483442
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h
3443+
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
3444+
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l
3445+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
34493446
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3450-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
3451-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v7, 0, 8
3447+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2
34523448
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
34533449
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
3454-
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
3455-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3456-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
34573450
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
3458-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34593451
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
3452+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
34603453
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
3461-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34623454
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l
3455+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
34633456
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3464-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
34653457
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
34663458
; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
3459+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
34673460
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
34683461
; GFX11-DL-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5]
34693462
; GFX11-DL-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/idot4u.ll

Lines changed: 37 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,40 +1669,38 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
16691669
; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
16701670
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
16711671
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1672-
; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1672+
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
16731673
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1674-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1674+
; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v6, 0
1675+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
16751676
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16761677
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
16771678
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
1678-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
1679-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
1680-
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
1679+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
1680+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v5, v0, s[2:3]
1681+
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v6, s[4:5]
16811682
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
1682-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
1683+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
16831684
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
1684-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4
1685-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
1686-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
1685+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5
1686+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
1687+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
16871688
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
16881689
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16891690
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
1690-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
1691+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
16911692
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1692-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v7, 0, 8
1693+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
16931694
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
16941695
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
1695-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1696-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
1697-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
1698-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1699-
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v2.l, v0.l
1700-
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v4, v4, 0xc0c0302
1701-
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v3, v3, 0xc0c0302
1696+
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
1697+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1698+
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
1699+
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
17021700
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
17031701
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17041702
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
1705-
; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
1703+
; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
17061704
; GFX11-DL-TRUE16-NEXT: s_endpgm
17071705
;
17081706
; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1964,44 +1962,41 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
19641962
; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes2:
19651963
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
19661964
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1967-
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1965+
; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
19681966
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1969-
; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v4, 0
1970-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1967+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
19711968
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
19721969
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
19731970
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
1974-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
1975-
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
1976-
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v4, s[4:5]
1971+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
1972+
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
1973+
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
19771974
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
1978-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
1975+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
19791976
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
1980-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
1981-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v3
1982-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
1983-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1977+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4
1978+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8
1979+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
19841980
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
1985-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
1986-
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
1987-
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2
1988-
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
1981+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1982+
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
1983+
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
19891984
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
1990-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
1985+
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
1986+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
19911987
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
19921988
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
1993-
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
1994-
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
1989+
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h
19951990
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
1996-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1991+
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4
1992+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
19971993
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v1.h, v0.l
1998-
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
1999-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1994+
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
20001995
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.h, v0.l
1996+
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
20011997
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
2002-
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
20031998
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v1.l, v0.l
2004-
; GFX11-DL-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5]
1999+
; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
20052000
; GFX11-DL-TRUE16-NEXT: s_endpgm
20062001
;
20072002
; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes2:

0 commit comments

Comments
 (0)