Skip to content

Commit 78d71f7

Browse files
committed
flat/global/scratch pattern for true16
1 parent 5f8b256 commit 78d71f7

File tree

11 files changed

+1149
-555
lines changed

11 files changed

+1149
-555
lines changed

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 222 additions & 32 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 219 additions & 143 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 155 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -48,15 +48,25 @@ define <2 x half> @chain_hi_to_lo_private() {
4848
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
4949
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
5050
;
51-
; GFX11-LABEL: chain_hi_to_lo_private:
52-
; GFX11: ; %bb.0: ; %bb
53-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54-
; GFX11-NEXT: s_mov_b32 s0, 2
55-
; GFX11-NEXT: scratch_load_u16 v0, off, s0
56-
; GFX11-NEXT: s_mov_b32 s0, 0
57-
; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58-
; GFX11-NEXT: s_waitcnt vmcnt(0)
59-
; GFX11-NEXT: s_setpc_b64 s[30:31]
51+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_private:
52+
; GFX11-TRUE16: ; %bb.0: ; %bb
53+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54+
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2
55+
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0
56+
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
57+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
59+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
60+
;
61+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_private:
62+
; GFX11-FAKE16: ; %bb.0: ; %bb
63+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64+
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2
65+
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0
66+
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
67+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
68+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
69+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
6070
bb:
6171
%gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
6272
%load_lo = load half, ptr addrspace(5) %gep_lo
@@ -104,13 +114,21 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
104114
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
105115
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
106116
;
107-
; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
108-
; GFX11: ; %bb.0: ; %bb
109-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110-
; GFX11-NEXT: scratch_load_u16 v0, v0, off
111-
; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
112-
; GFX11-NEXT: s_waitcnt vmcnt(0)
113-
; GFX11-NEXT: s_setpc_b64 s[30:31]
117+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases:
118+
; GFX11-TRUE16: ; %bb.0: ; %bb
119+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120+
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off
121+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
122+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
123+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
124+
;
125+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
126+
; GFX11-FAKE16: ; %bb.0: ; %bb
127+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128+
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off
129+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
130+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
131+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
114132
bb:
115133
%load_lo = load half, ptr addrspace(5) %base_lo
116134
%load_hi = load half, ptr addrspace(5) %base_hi
@@ -288,17 +306,29 @@ define <2 x half> @chain_hi_to_lo_global() {
288306
; GFX10-NEXT: s_waitcnt vmcnt(0)
289307
; GFX10-NEXT: s_setpc_b64 s[30:31]
290308
;
291-
; GFX11-LABEL: chain_hi_to_lo_global:
292-
; GFX11: ; %bb.0: ; %bb
293-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294-
; GFX11-NEXT: v_mov_b32_e32 v0, 2
295-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
296-
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
297-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
298-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
299-
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
300-
; GFX11-NEXT: s_waitcnt vmcnt(0)
301-
; GFX11-NEXT: s_setpc_b64 s[30:31]
309+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
310+
; GFX11-TRUE16: ; %bb.0: ; %bb
311+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
313+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
314+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
315+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
316+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
317+
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
318+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
319+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
320+
;
321+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
322+
; GFX11-FAKE16: ; %bb.0: ; %bb
323+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
325+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
326+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
327+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
328+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
329+
; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
330+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
331+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
302332
bb:
303333
%gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
304334
%load_lo = load half, ptr addrspace(1) %gep_lo
@@ -328,13 +358,21 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
328358
; GFX10-NEXT: s_waitcnt vmcnt(0)
329359
; GFX10-NEXT: s_setpc_b64 s[30:31]
330360
;
331-
; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
332-
; GFX11: ; %bb.0: ; %bb
333-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334-
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
335-
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
336-
; GFX11-NEXT: s_waitcnt vmcnt(0)
337-
; GFX11-NEXT: s_setpc_b64 s[30:31]
361+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases:
362+
; GFX11-TRUE16: ; %bb.0: ; %bb
363+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
365+
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
366+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
367+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
368+
;
369+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
370+
; GFX11-FAKE16: ; %bb.0: ; %bb
371+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
373+
; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
374+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
375+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
338376
bb:
339377
%load_lo = load half, ptr addrspace(1) %base_lo
340378
%load_hi = load half, ptr addrspace(1) %base_hi
@@ -587,34 +625,65 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
587625
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
588626
; FLATSCR_GFX10-NEXT: s_endpgm
589627
;
590-
; GFX11-LABEL: vload2_private:
591-
; GFX11: ; %bb.0: ; %entry
592-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
593-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
594-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
595-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
596-
; GFX11-NEXT: s_waitcnt vmcnt(0)
597-
; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
598-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
599-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
600-
; GFX11-NEXT: s_waitcnt vmcnt(0)
601-
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
602-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
603-
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
604-
; GFX11-NEXT: s_waitcnt vmcnt(0)
605-
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
606-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
607-
; GFX11-NEXT: s_clause 0x1
608-
; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2
609-
; GFX11-NEXT: scratch_load_u16 v3, off, off
610-
; GFX11-NEXT: s_waitcnt vmcnt(1)
611-
; GFX11-NEXT: v_mov_b32_e32 v1, v0
612-
; GFX11-NEXT: s_waitcnt vmcnt(0)
613-
; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
614-
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
615-
; GFX11-NEXT: s_waitcnt vmcnt(0)
616-
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
617-
; GFX11-NEXT: s_endpgm
628+
; GFX11-TRUE16-LABEL: vload2_private:
629+
; GFX11-TRUE16: ; %bb.0: ; %entry
630+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
631+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
632+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
633+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1]
634+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
635+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
636+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
637+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2
638+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
639+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
640+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
641+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4
642+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
643+
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
644+
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
645+
; GFX11-TRUE16-NEXT: s_clause 0x1
646+
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off offset:2
647+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off
648+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
649+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
650+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
651+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
652+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
653+
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
654+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
655+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
656+
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
657+
; GFX11-TRUE16-NEXT: s_endpgm
658+
;
659+
; GFX11-FAKE16-LABEL: vload2_private:
660+
; GFX11-FAKE16: ; %bb.0: ; %entry
661+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
662+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
663+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
664+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1]
665+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
666+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc
667+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
668+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
669+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
670+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
671+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
672+
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
673+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
674+
; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
675+
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
676+
; GFX11-FAKE16-NEXT: s_clause 0x1
677+
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
678+
; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
679+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
680+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
681+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
682+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
683+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
684+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
685+
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
686+
; GFX11-FAKE16-NEXT: s_endpgm
618687
entry:
619688
%loc = alloca [3 x i16], align 2, addrspace(5)
620689
%tmp = load i16, ptr addrspace(1) %in, align 2
@@ -836,17 +905,30 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
836905
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
837906
; GFX10-NEXT: s_setpc_b64 s[30:31]
838907
;
839-
; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
840-
; GFX11: ; %bb.0: ; %bb
841-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
842-
; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
843-
; GFX11-NEXT: s_waitcnt vmcnt(0)
844-
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
845-
; GFX11-NEXT: s_waitcnt vmcnt(0)
846-
; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
847-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
848-
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
849-
; GFX11-NEXT: s_setpc_b64 s[30:31]
908+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep:
909+
; GFX11-TRUE16: ; %bb.0: ; %bb
910+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
911+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc
912+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
913+
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
914+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
915+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
916+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
917+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
918+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
919+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
920+
;
921+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
922+
; GFX11-FAKE16: ; %bb.0: ; %bb
923+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924+
; GFX11-FAKE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
925+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
926+
; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
927+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
928+
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
929+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
930+
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
931+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
850932
bb:
851933
%gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1
852934
%load_lo = load volatile i16, ptr addrspace(1) %gep_lo

0 commit comments

Comments
 (0)