Skip to content

Commit c896f7b

Browse files
authored
[AMDGPU][True16][CodeGen] build_vector pattern in true16 (#118904)
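Add SelectionDAG build_vector patterns for real true16: a divergent build_vector of two VGPR_16 operands (or one VGPR_16 operand and undef) is selected to a REG_SEQUENCE over the lo16/hi16 subregisters, and the existing VGPR_32-based patterns are restricted to targets without true16 or with fake true16.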
1 parent 449f84f commit c896f7b

File tree

9 files changed

+7259
-3950
lines changed

9 files changed

+7259
-3950
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
782782
return true;
783783

784784
// TODO: This should probably be a combine somewhere
785-
// (build_vector $src0, undef) -> copy $src0
785+
// (build_vector $src0, undef) -> copy $src0
786786
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
787787
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
788788
MI.setDesc(TII.get(AMDGPU::COPY));

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3359,6 +3359,8 @@ def : GCNPat <
33593359
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
33603360
>;
33613361

3362+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3363+
let True16Predicate = p in {
33623364
def : GCNPat <
33633365
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
33643366
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3368,6 +3370,7 @@ def : GCNPat <
33683370
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
33693371
(S_LSHL_B32 SReg_32:$src1, (i32 16))
33703372
>;
3373+
}
33713374

33723375
def : GCNPat <
33733376
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3377,6 +3380,8 @@ def : GCNPat <
33773380
}
33783381

33793382
let SubtargetPredicate = HasVOP3PInsts in {
3383+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3384+
let True16Predicate = p in
33803385
def : GCNPat <
33813386
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
33823387
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3406,12 +3411,24 @@ def : GCNPat <
34063411
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
34073412
>;
34083413

3414+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3415+
let True16Predicate = p in
34093416
// Take the lower 16 bits from each VGPR_32 and concat them
34103417
def : GCNPat <
34113418
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
34123419
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
34133420
>;
34143421

3422+
let True16Predicate = UseRealTrue16Insts in {
3423+
def : GCNPat <
3424+
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
3425+
(REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
3426+
>;
3427+
def : GCNPat <
3428+
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
3429+
(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
3430+
>;
3431+
}
34153432

34163433
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
34173434
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x07060100)
@@ -3437,6 +3454,8 @@ def : GCNPat <
34373454

34383455
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
34393456
// Special case, can use V_ALIGNBIT (always uses encoded literal)
3457+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3458+
let True16Predicate = p in {
34403459
def : GCNPat <
34413460
(vecTy (DivergentBinFrag<build_vector>
34423461
(Ty !if(!eq(Ty, i16),
@@ -3457,7 +3476,7 @@ def : GCNPat <
34573476
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
34583477
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
34593478
>;
3460-
3479+
}
34613480

34623481
} // end foreach Ty
34633482

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 7057 additions & 3802 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 78 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -587,34 +587,63 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
587587
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
588588
; FLATSCR_GFX10-NEXT: s_endpgm
589589
;
590-
; GFX11-LABEL: vload2_private:
591-
; GFX11: ; %bb.0: ; %entry
592-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
593-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
594-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
595-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
596-
; GFX11-NEXT: s_waitcnt vmcnt(0)
597-
; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
598-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
599-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
600-
; GFX11-NEXT: s_waitcnt vmcnt(0)
601-
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
602-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
603-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
604-
; GFX11-NEXT: s_waitcnt vmcnt(0)
605-
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
606-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
607-
; GFX11-NEXT: s_clause 0x1
608-
; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2
609-
; GFX11-NEXT: scratch_load_u16 v3, off, off
610-
; GFX11-NEXT: s_waitcnt vmcnt(1)
611-
; GFX11-NEXT: v_mov_b32_e32 v1, v0
612-
; GFX11-NEXT: s_waitcnt vmcnt(0)
613-
; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
614-
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
615-
; GFX11-NEXT: s_waitcnt vmcnt(0)
616-
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
617-
; GFX11-NEXT: s_endpgm
590+
; GFX11-TRUE16-LABEL: vload2_private:
591+
; GFX11-TRUE16: ; %bb.0: ; %entry
592+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
593+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
594+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
595+
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1]
596+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
597+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
598+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
599+
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
600+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
601+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
602+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
603+
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
604+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
605+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
606+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
607+
; GFX11-TRUE16-NEXT: s_clause 0x1
608+
; GFX11-TRUE16-NEXT: scratch_load_u16 v3, off, off offset:2
609+
; GFX11-TRUE16-NEXT: scratch_load_u16 v0, off, off
610+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
611+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
612+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
613+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
614+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
615+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
616+
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
617+
; GFX11-TRUE16-NEXT: s_endpgm
618+
;
619+
; GFX11-FAKE16-LABEL: vload2_private:
620+
; GFX11-FAKE16: ; %bb.0: ; %entry
621+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
622+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
623+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
624+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1]
625+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
626+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc
627+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
628+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
629+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
630+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
631+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
632+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
633+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
634+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
635+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
636+
; GFX11-FAKE16-NEXT: s_clause 0x1
637+
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
638+
; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
639+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
640+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
641+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
642+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
643+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
644+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
645+
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
646+
; GFX11-FAKE16-NEXT: s_endpgm
618647
entry:
619648
%loc = alloca [3 x i16], align 2, addrspace(5)
620649
%tmp = load i16, ptr addrspace(1) %in, align 2
@@ -968,16 +997,27 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
968997
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
969998
; GFX10-NEXT: s_setpc_b64 s[30:31]
970999
;
971-
; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
972-
; GFX11: ; %bb.0: ; %bb
973-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
974-
; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b
975-
; GFX11-NEXT: ds_load_u16 v3, v0
976-
; GFX11-NEXT: ds_store_b16 v1, v2
977-
; GFX11-NEXT: ds_load_u16 v0, v0 offset:2
978-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
979-
; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
980-
; GFX11-NEXT: s_setpc_b64 s[30:31]
1000+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store:
1001+
; GFX11-TRUE16: ; %bb.0: ; %bb
1002+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1003+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x7b
1004+
; GFX11-TRUE16-NEXT: ds_load_u16 v3, v0
1005+
; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2
1006+
; GFX11-TRUE16-NEXT: ds_load_u16 v0, v0 offset:2
1007+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1008+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
1009+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
1010+
;
1011+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store:
1012+
; GFX11-FAKE16: ; %bb.0: ; %bb
1013+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1014+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0x7b
1015+
; GFX11-FAKE16-NEXT: ds_load_u16 v3, v0
1016+
; GFX11-FAKE16-NEXT: ds_store_b16 v1, v2
1017+
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:2
1018+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1019+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
1020+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
9811021
bb:
9821022
%gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
9831023
%load_hi = load i16, ptr addrspace(3) %ptr

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -214,13 +214,21 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
214214
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
215215
; CI-NEXT: s_setpc_b64 s[30:31]
216216
;
217-
; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
218-
; GFX11: ; %bb.0:
219-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220-
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
221-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
222-
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
223-
; GFX11-NEXT: s_setpc_b64 s[30:31]
217+
; GFX11-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
218+
; GFX11-TRUE16: ; %bb.0:
219+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
221+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
222+
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
223+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
224+
;
225+
; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
226+
; GFX11-FAKE16: ; %bb.0:
227+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
229+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
230+
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
231+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
224232
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
225233
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
226234
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
@@ -2799,14 +2807,23 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
27992807
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
28002808
; CI-NEXT: s_setpc_b64 s[30:31]
28012809
;
2802-
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2803-
; GFX11: ; %bb.0:
2804-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2805-
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
2806-
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2807-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2808-
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2809-
; GFX11-NEXT: s_setpc_b64 s[30:31]
2810+
; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2811+
; GFX11-TRUE16: ; %bb.0:
2812+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
2814+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2815+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
2816+
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
2817+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
2818+
;
2819+
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2820+
; GFX11-FAKE16: ; %bb.0:
2821+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2822+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
2823+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2824+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
2825+
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
2826+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
28102827
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
28112828
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
28122829
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
@@ -2850,7 +2867,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
28502867
; GFX11-TRUE16: ; %bb.0:
28512868
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28522869
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
2853-
; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
2870+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
28542871
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28552872
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
28562873
; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1

0 commit comments

Comments
 (0)