Skip to content

Commit 909095a

Browse files
committed
AMDGPU: Precommit test showing codegen weakness
The code generated for gfx9 contains many redundant v_bfi_b32 instructions. Differential Revision: https://reviews.llvm.org/D149840
1 parent 3cd230e commit 909095a

File tree

1 file changed

+85
-0
lines changed

1 file changed

+85
-0
lines changed

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,3 +916,88 @@ exit:
916916
%r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
917917
ret <4 x half> %r2
918918
}
919+
920+
; Precommit test recording a codegen weakness (see D149840).
;
; Eight i16 values are loaded from consecutive half-sized slots behind an
; addrspace(3) pointer, paired into four partially filled <8 x i16> vectors,
; and then concatenated into one <8 x i16> result by a chain of shufflevectors.
;
; The autogenerated checks capture the current output. On SI the loads become
; four ds_read_b32 plus 16-bit right shifts; on gfx9 they become two
; ds_read2_b32 followed by a run of v_bfi_b32 instructions, which the commit
; description calls useless. Presumably a follow-up change removes them, at
; which point the checks should be regenerated rather than edited by hand.
define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
921+
; SI-LABEL: large_vector:
922+
; SI: ; %bb.0:
923+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924+
; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
925+
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
926+
; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
927+
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
928+
; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
929+
; SI-NEXT: s_mov_b32 m0, -1
930+
; SI-NEXT: ds_read_b32 v0, v0
931+
; SI-NEXT: ds_read_b32 v2, v1
932+
; SI-NEXT: ds_read_b32 v4, v3
933+
; SI-NEXT: ds_read_b32 v6, v5
934+
; SI-NEXT: s_waitcnt lgkmcnt(3)
935+
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
936+
; SI-NEXT: s_waitcnt lgkmcnt(2)
937+
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
938+
; SI-NEXT: s_waitcnt lgkmcnt(1)
939+
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
940+
; SI-NEXT: s_waitcnt lgkmcnt(0)
941+
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
942+
; SI-NEXT: s_setpc_b64 s[30:31]
943+
;
944+
; GFX9-LABEL: large_vector:
945+
; GFX9: ; %bb.0:
946+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
947+
; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
948+
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
949+
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
950+
; GFX9-NEXT: s_mov_b32 s4, 0xffff
951+
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
952+
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v0
953+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
954+
; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v2
955+
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
956+
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
957+
; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v5
958+
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v5
959+
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
960+
; GFX9-NEXT: s_setpc_b64 s[30:31]
961+
; Base element index. The shift leaves the low four bits zero, so every
; "or" with a constant in 0..7 below behaves like adding that constant.
%idx = shl i32 %idxp, 4
962+
963+
; Pair 0 - elements idx+0 and idx+1 go into lanes 0 and 1 of %v0; the other
; lanes stay poison. The even element is loaded with align 4, the odd one
; with align 2 (the same pattern repeats for every pair).
%i.0 = or i32 %idx, 0
964+
%p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
965+
%x.0 = load i16, ptr addrspace(3) %p.0, align 4
966+
%v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
967+
%i.1 = or i32 %idx, 1
968+
%p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
969+
%x.1 = load i16, ptr addrspace(3) %p.1, align 2
970+
%v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1
971+
972+
; Pair 1 - elements idx+2 and idx+3 go into lanes 0 and 1 of %v1.
%i.2 = or i32 %idx, 2
973+
%p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
974+
%x.2 = load i16, ptr addrspace(3) %p.2, align 4
975+
%v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
976+
%i.3 = or i32 %idx, 3
977+
%p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
978+
%x.3 = load i16, ptr addrspace(3) %p.3, align 2
979+
%v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1
980+
981+
; Pair 2 - elements idx+4 and idx+5 go into lanes 0 and 1 of %v2.
%i.4 = or i32 %idx, 4
982+
%p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
983+
%x.4 = load i16, ptr addrspace(3) %p.4, align 4
984+
%v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
985+
%i.5 = or i32 %idx, 5
986+
%p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
987+
%x.5 = load i16, ptr addrspace(3) %p.5, align 2
988+
%v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1
989+
990+
; Pair 3 - elements idx+6 and idx+7 go into lanes 0 and 1 of %v3.
%i.6 = or i32 %idx, 6
991+
%p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
992+
%x.6 = load i16, ptr addrspace(3) %p.6, align 4
993+
%v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
994+
%i.7 = or i32 %idx, 7
995+
%p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
996+
%x.7 = load i16, ptr addrspace(3) %p.7, align 2
997+
%v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1
998+
999+
; Concatenate the four pairs. In each mask, indices 8 and 9 pick lanes 0 and 1
; of the second operand, so every step appends one more pair after the lanes
; already filled from the first operand.
%z.1 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
1000+
%z.2 = shufflevector <8 x i16> %z.1, <8 x i16> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
1001+
%z.3 = shufflevector <8 x i16> %z.2, <8 x i16> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
1002+
ret <8 x i16> %z.3
1003+
}

0 commit comments

Comments
 (0)