Skip to content

[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
return true;

// TODO: This should probably be a combine somewhere
// (build_vector $src0, undef) -> copy $src0
// (build_vector $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
MI.setDesc(TII.get(AMDGPU::COPY));
Expand Down
21 changes: 20 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3359,6 +3359,8 @@ def : GCNPat <
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;

foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
Expand All @@ -3368,6 +3370,7 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
}

def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
Expand All @@ -3377,6 +3380,8 @@ def : GCNPat <
}

let SubtargetPredicate = HasVOP3PInsts in {
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
Expand Down Expand Up @@ -3406,12 +3411,24 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;

let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
(REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
>;
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
>;
}

// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
Expand All @@ -3437,6 +3454,8 @@ def : GCNPat <

// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps the PERM_B32 pat at new line 3456 should also be disabled in Real True16 mode? It seems redundant with the new REG_SEQUENCE pat.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Took a closer look at this. I think these vector patterns might all be moving to just fake16/non-16 mode. I will probably get a patch up downstream first.

Copy link
Contributor Author

@broxigarchen broxigarchen Feb 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the other patterns. For now I am just removing them from true16: the codegen does work and selects the reg_sequence, but it generates additional right shifts, which is worse than the old pattern right now.

I think better selections exist, but they need a bit more work. This might involve adding more
true16 patterns and some bitcast patterns.

We could merge this patch first and then follow up on this later

Expand All @@ -3457,7 +3476,7 @@ def : GCNPat <
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
>;

}

} // end foreach Ty

Expand Down
10,859 changes: 7,057 additions & 3,802 deletions llvm/test/CodeGen/AMDGPU/bf16.ll

Large diffs are not rendered by default.

116 changes: 78 additions & 38 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -587,34 +587,63 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR_GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: vload2_private:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2
; GFX11-NEXT: scratch_load_u16 v3, off, off
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: vload2_private:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_u16 v3, off, off offset:2
; GFX11-TRUE16-NEXT: scratch_load_u16 v0, off, off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: vload2_private:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%loc = alloca [3 x i16], align 2, addrspace(5)
%tmp = load i16, ptr addrspace(1) %in, align 2
Expand Down Expand Up @@ -968,16 +997,27 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX11-NEXT: ds_load_u16 v3, v0
; GFX11-NEXT: ds_store_b16 v1, v2
; GFX11-NEXT: ds_load_u16 v0, v0 offset:2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX11-TRUE16-NEXT: ds_load_u16 v3, v0
; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2
; GFX11-TRUE16-NEXT: ds_load_u16 v0, v0 offset:2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX11-FAKE16-NEXT: ds_load_u16 v3, v0
; GFX11-FAKE16-NEXT: ds_store_b16 v1, v2
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
%load_hi = load i16, ptr addrspace(3) %ptr
Expand Down
49 changes: 33 additions & 16 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,21 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
Expand Down Expand Up @@ -2799,14 +2807,23 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
Expand Down Expand Up @@ -2850,7 +2867,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1
Expand Down
Loading