@@ -48,15 +48,25 @@ define <2 x half> @chain_hi_to_lo_private() {
48
48
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
49
49
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
50
50
;
51
- ; GFX11-LABEL: chain_hi_to_lo_private:
52
- ; GFX11: ; %bb.0: ; %bb
53
- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54
- ; GFX11-NEXT: s_mov_b32 s0, 2
55
- ; GFX11-NEXT: scratch_load_u16 v0, off, s0
56
- ; GFX11-NEXT: s_mov_b32 s0, 0
57
- ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
59
- ; GFX11-NEXT: s_setpc_b64 s[30:31]
51
+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private:
52
+ ; GFX11-TRUE16: ; %bb.0: ; %bb
53
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54
+ ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2
55
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0
56
+ ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
57
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
59
+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
60
+ ;
61
+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private:
62
+ ; GFX11-FAKE16: ; %bb.0: ; %bb
63
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64
+ ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2
65
+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0
66
+ ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
67
+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
68
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
69
+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
60
70
bb:
61
71
%gep_lo = getelementptr inbounds half , ptr addrspace (5 ) null , i64 1
62
72
%load_lo = load half , ptr addrspace (5 ) %gep_lo
@@ -104,13 +114,21 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
104
114
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
105
115
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
106
116
;
107
- ; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
108
- ; GFX11: ; %bb.0: ; %bb
109
- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110
- ; GFX11-NEXT: scratch_load_u16 v0, v0, off
111
- ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
112
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
113
- ; GFX11-NEXT: s_setpc_b64 s[30:31]
117
+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases:
118
+ ; GFX11-TRUE16: ; %bb.0: ; %bb
119
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off
121
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
122
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
123
+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
124
+ ;
125
+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
126
+ ; GFX11-FAKE16: ; %bb.0: ; %bb
127
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128
+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off
129
+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
130
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
131
+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
114
132
bb:
115
133
%load_lo = load half , ptr addrspace (5 ) %base_lo
116
134
%load_hi = load half , ptr addrspace (5 ) %base_hi
@@ -288,17 +306,29 @@ define <2 x half> @chain_hi_to_lo_global() {
288
306
; GFX10-NEXT: s_waitcnt vmcnt(0)
289
307
; GFX10-NEXT: s_setpc_b64 s[30:31]
290
308
;
291
- ; GFX11-LABEL: chain_hi_to_lo_global:
292
- ; GFX11: ; %bb.0: ; %bb
293
- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294
- ; GFX11-NEXT: v_mov_b32_e32 v0, 2
295
- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
296
- ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
297
- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
298
- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
299
- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
300
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
301
- ; GFX11-NEXT: s_setpc_b64 s[30:31]
309
+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
310
+ ; GFX11-TRUE16: ; %bb.0: ; %bb
311
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
313
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
314
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
315
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
316
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
317
+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
318
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
319
+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
320
+ ;
321
+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
322
+ ; GFX11-FAKE16: ; %bb.0: ; %bb
323
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
325
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
326
+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
327
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
328
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
329
+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
330
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
331
+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
302
332
bb:
303
333
%gep_lo = getelementptr inbounds half , ptr addrspace (1 ) null , i64 1
304
334
%load_lo = load half , ptr addrspace (1 ) %gep_lo
@@ -328,13 +358,21 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
328
358
; GFX10-NEXT: s_waitcnt vmcnt(0)
329
359
; GFX10-NEXT: s_setpc_b64 s[30:31]
330
360
;
331
- ; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
332
- ; GFX11: ; %bb.0: ; %bb
333
- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334
- ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
335
- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
336
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
337
- ; GFX11-NEXT: s_setpc_b64 s[30:31]
361
+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases:
362
+ ; GFX11-TRUE16: ; %bb.0: ; %bb
363
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
365
+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
366
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
367
+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
368
+ ;
369
+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
370
+ ; GFX11-FAKE16: ; %bb.0: ; %bb
371
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372
+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
373
+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
374
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
375
+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
338
376
bb:
339
377
%load_lo = load half , ptr addrspace (1 ) %base_lo
340
378
%load_hi = load half , ptr addrspace (1 ) %base_hi
@@ -587,34 +625,65 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
587
625
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
588
626
; FLATSCR_GFX10-NEXT: s_endpgm
589
627
;
590
- ; GFX11-LABEL: vload2_private:
591
- ; GFX11: ; %bb.0: ; %entry
592
- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
593
- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
594
- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
595
- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
596
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
597
- ; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
598
- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
599
- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
600
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
601
- ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
602
- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
603
- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
604
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
605
- ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
606
- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
607
- ; GFX11-NEXT: s_clause 0x1
608
- ; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2
609
- ; GFX11-NEXT: scratch_load_u16 v3, off, off
610
- ; GFX11-NEXT: s_waitcnt vmcnt(1)
611
- ; GFX11-NEXT: v_mov_b32_e32 v1, v0
612
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
613
- ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
614
- ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
615
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
616
- ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
617
- ; GFX11-NEXT: s_endpgm
628
+ ; GFX11-TRUE16-LABEL: vload2_private:
629
+ ; GFX11-TRUE16: ; %bb.0: ; %entry
630
+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
631
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
632
+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
633
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1]
634
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
635
+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
636
+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
637
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2
638
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
639
+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
640
+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
641
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4
642
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
643
+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
644
+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
645
+ ; GFX11-TRUE16-NEXT: s_clause 0x1
646
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off offset:2
647
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off
648
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
649
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
650
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
651
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
652
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
653
+ ; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
654
+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
655
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
656
+ ; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
657
+ ; GFX11-TRUE16-NEXT: s_endpgm
658
+ ;
659
+ ; GFX11-FAKE16-LABEL: vload2_private:
660
+ ; GFX11-FAKE16: ; %bb.0: ; %entry
661
+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
662
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
663
+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
664
+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1]
665
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
666
+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc
667
+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
668
+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
669
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
670
+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
671
+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
672
+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
673
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
674
+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
675
+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
676
+ ; GFX11-FAKE16-NEXT: s_clause 0x1
677
+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
678
+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
679
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
680
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
681
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
682
+ ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
683
+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
684
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
685
+ ; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
686
+ ; GFX11-FAKE16-NEXT: s_endpgm
618
687
entry:
619
688
%loc = alloca [3 x i16 ], align 2 , addrspace (5 )
620
689
%tmp = load i16 , ptr addrspace (1 ) %in , align 2
@@ -836,17 +905,30 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
836
905
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
837
906
; GFX10-NEXT: s_setpc_b64 s[30:31]
838
907
;
839
- ; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
840
- ; GFX11: ; %bb.0: ; %bb
841
- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
842
- ; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
843
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
844
- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
845
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
846
- ; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
847
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
848
- ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
849
- ; GFX11-NEXT: s_setpc_b64 s[30:31]
908
+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep:
909
+ ; GFX11-TRUE16: ; %bb.0: ; %bb
910
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
911
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc
912
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
913
+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
914
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
915
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
916
+ ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
917
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
918
+ ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
919
+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
920
+ ;
921
+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
922
+ ; GFX11-FAKE16: ; %bb.0: ; %bb
923
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924
+ ; GFX11-FAKE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
925
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
926
+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
927
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
928
+ ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
929
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
930
+ ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
931
+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
850
932
bb:
851
933
%gep_lo = getelementptr inbounds i16 , ptr addrspace (1 ) %ptr , i64 1
852
934
%load_lo = load volatile i16 , ptr addrspace (1 ) %gep_lo
0 commit comments