Skip to content

Commit b9bbb43

Browse files
committed
patch2
1 parent 0c85389 commit b9bbb43

File tree

7 files changed

+3662
-3745
lines changed

7 files changed

+3662
-3745
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -782,22 +782,9 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
782782
return true;
783783

784784
// TODO: This should probably be a combine somewhere
785+
// (build_vector $src0, undef) -> copy $src0
785786
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
786787
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
787-
if (Subtarget->useRealTrue16Insts() && IsVector) {
788-
// (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
789-
// -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
790-
Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
791-
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
792-
BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
793-
.addReg(Undef)
794-
.addReg(Src0)
795-
.addImm(AMDGPU::lo16);
796-
MI.eraseFromParent();
797-
return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
798-
RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
799-
}
800-
// (build_vector $src0, undef) -> copy $src0
801788
MI.setDesc(TII.get(AMDGPU::COPY));
802789
MI.removeOperand(2);
803790
const auto &RC =

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3411,9 +3411,8 @@ def : GCNPat <
34113411
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
34123412
(REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
34133413
>;
3414-
// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
34153414
def : GCNPat <
3416-
(vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
3415+
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
34173416
(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
34183417
>;
34193418
}

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 3555 additions & 3642 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Lines changed: 33 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -214,13 +214,21 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
214214
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
215215
; CI-NEXT: s_setpc_b64 s[30:31]
216216
;
217-
; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
218-
; GFX11: ; %bb.0:
219-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220-
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
221-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
222-
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
223-
; GFX11-NEXT: s_setpc_b64 s[30:31]
217+
; GFX11-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
218+
; GFX11-TRUE16: ; %bb.0:
219+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
221+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
222+
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
223+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
224+
;
225+
; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
226+
; GFX11-FAKE16: ; %bb.0:
227+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
229+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
230+
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
231+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
224232
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
225233
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
226234
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
@@ -2799,14 +2807,23 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
27992807
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
28002808
; CI-NEXT: s_setpc_b64 s[30:31]
28012809
;
2802-
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2803-
; GFX11: ; %bb.0:
2804-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2805-
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
2806-
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2807-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2808-
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2809-
; GFX11-NEXT: s_setpc_b64 s[30:31]
2810+
; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2811+
; GFX11-TRUE16: ; %bb.0:
2812+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
2814+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2815+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
2816+
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
2817+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
2818+
;
2819+
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2820+
; GFX11-FAKE16: ; %bb.0:
2821+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2822+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
2823+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2824+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
2825+
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
2826+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
28102827
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
28112828
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
28122829
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
@@ -2850,7 +2867,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
28502867
; GFX11-TRUE16: ; %bb.0:
28512868
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28522869
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
2853-
; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
2870+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
28542871
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28552872
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
28562873
; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1

llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll

Lines changed: 24 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -480,11 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
480480
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
481481
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
482482
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
483-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
484-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
485-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
486483
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
487-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
484+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
488485
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
489486
;
490487
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -604,12 +601,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
604601
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
605602
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
606603
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
607-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
608-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
609-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
610604
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
611-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
612-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
605+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
606+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
613607
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
614608
;
615609
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -731,13 +725,10 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
731725
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
732726
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
733727
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
734-
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v4, s0, 0x7fff
735-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
736-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
737-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
738-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
739-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
740-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
728+
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
729+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
730+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
731+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
741732
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
742733
;
743734
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -804,13 +795,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
804795
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff
805796
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
806797
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
798+
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
807799
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5
808800
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
809-
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
810-
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
801+
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
811802
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
803+
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
812804
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
813-
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
814805
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
815806
;
816807
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -877,12 +868,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
877868
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
878869
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
879870
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
880-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
871+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
881872
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
882-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
883-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
884-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
885-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
873+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
886874
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
887875
;
888876
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -937,8 +925,8 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
937925
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938926
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
939927
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
940-
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
941928
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
929+
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
942930
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
943931
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
944932
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1016,27 +1004,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
10161004
; GFX11-SDAG-TRUE16: ; %bb.0:
10171005
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10181006
; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000
1019-
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
1007+
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
10201008
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff
10211009
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
1010+
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
10221011
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
10231012
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
1024-
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1013+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l
10251014
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1026-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
1015+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
10271016
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
1028-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1017+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
10291018
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
1030-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
1031-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1032-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
1033-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
1034-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
1035-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
1036-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1037-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
1019+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
10381020
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1039-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
1021+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
10401022
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
10411023
;
10421024
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1209,20 +1191,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
12091191
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
12101192
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
12111193
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1194+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
12121195
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
1213-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l
12141196
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1215-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l
1197+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l
12161198
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l
1217-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1218-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
1219-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
1220-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1221-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
1222-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
12231199
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1224-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
1225-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
1200+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
1201+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
12261202
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
12271203
;
12281204
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:

llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll

Lines changed: 39 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4210,18 +4210,45 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
42104210
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
42114211
; GFX10-NEXT: s_endpgm
42124212
;
4213-
; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
4214-
; GFX11: ; %bb.0:
4215-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4216-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4217-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4218-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4219-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4220-
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
4221-
; GFX11-NEXT: s_waitcnt vmcnt(0)
4222-
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4223-
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
4224-
; GFX11-NEXT: s_endpgm
4213+
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
4214+
; GFX11-SDAG: ; %bb.0:
4215+
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4216+
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4217+
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
4218+
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4219+
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
4220+
; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
4221+
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
4222+
; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4223+
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
4224+
; GFX11-SDAG-NEXT: s_endpgm
4225+
;
4226+
; GFX11-GISEL-TRUE16-LABEL: v_test_v2i16_x_add_undef_neg32:
4227+
; GFX11-GISEL-TRUE16: ; %bb.0:
4228+
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4229+
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4230+
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
4231+
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4232+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
4233+
; GFX11-GISEL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
4234+
; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, 0xffffffe0
4235+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
4236+
; GFX11-GISEL-TRUE16-NEXT: v_pk_add_u16 v1, v1, s2
4237+
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
4238+
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
4239+
;
4240+
; GFX11-GISEL-FAKE16-LABEL: v_test_v2i16_x_add_undef_neg32:
4241+
; GFX11-GISEL-FAKE16: ; %bb.0:
4242+
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4243+
; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4244+
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
4245+
; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4246+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
4247+
; GFX11-GISEL-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
4248+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
4249+
; GFX11-GISEL-FAKE16-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4250+
; GFX11-GISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
4251+
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
42254252
%tid = call i32 @llvm.amdgcn.workitem.id.x()
42264253
%tid.ext = sext i32 %tid to i64
42274254
%gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext

llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -82,9 +82,9 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo
8282
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
8383
; GFX11-TRUE16: ; %bb.0:
8484
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
86-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
87-
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
85+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
86+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
87+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
8888
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
8989
;
9090
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
@@ -144,11 +144,10 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo
144144
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
145145
; GFX11-TRUE16: ; %bb.0:
146146
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
148-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
149-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
147+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v1
148+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v0
150149
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v2
151-
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
150+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
152151
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
153152
;
154153
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
@@ -405,10 +404,9 @@ define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x flo
405404
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
406405
; GFX11-TRUE16: ; %bb.0:
407406
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
409-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
410-
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
411-
; GFX11-TRUE16-NEXT: global_store_b32 v[2:3], v0, off
407+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
408+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
409+
; GFX11-TRUE16-NEXT: global_store_b32 v[2:3], v1, off
412410
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
413411
;
414412
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:

0 commit comments

Comments
 (0)