@@ -5719,33 +5719,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5719
5719
; GFX6-NEXT: s_mov_b32 s0, s4
5720
5720
; GFX6-NEXT: s_mov_b32 s1, s5
5721
5721
; GFX6-NEXT: s_waitcnt vmcnt(0)
5722
- ; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1
5723
- ; GFX6-NEXT: v_bfe_u32 v0, v29, 10 , 1
5724
- ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
5725
- ; GFX6-NEXT: v_bfe_u32 v5, v29, 9 , 1
5722
+ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v29
5723
+ ; GFX6-NEXT: v_bfe_u32 v0, v29, 14 , 1
5724
+ ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
5725
+ ; GFX6-NEXT: v_bfe_u32 v5, v29, 13 , 1
5726
5726
; GFX6-NEXT: s_waitcnt expcnt(0)
5727
- ; GFX6-NEXT: v_bfe_u32 v3, v29, 8 , 1
5728
- ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64
5729
- ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29
5727
+ ; GFX6-NEXT: v_bfe_u32 v3, v29, 12 , 1
5728
+ ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
5729
+ ; GFX6-NEXT: v_bfe_u32 v8, v29, 11, 1
5730
5730
; GFX6-NEXT: s_waitcnt expcnt(0)
5731
- ; GFX6-NEXT: v_bfe_u32 v6, v29, 14 , 1
5732
- ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
5733
- ; GFX6-NEXT: v_bfe_u32 v27, v29, 5 , 1
5734
- ; GFX6-NEXT: v_bfe_u32 v23, v29, 7 , 1
5735
- ; GFX6-NEXT: v_bfe_u32 v19, v29, 1 , 1
5736
- ; GFX6-NEXT: v_bfe_u32 v15, v29, 3 , 1
5737
- ; GFX6-NEXT: v_bfe_u32 v11, v29, 13 , 1
5738
- ; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1
5739
- ; GFX6-NEXT: v_bfe_u32 v21, v29, 6 , 1
5740
- ; GFX6-NEXT: v_and_b32_e32 v17, 1, v29
5741
- ; GFX6-NEXT: v_bfe_u32 v13, v29, 2 , 1
5731
+ ; GFX6-NEXT: v_bfe_u32 v6, v29, 10 , 1
5732
+ ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
5733
+ ; GFX6-NEXT: v_bfe_u32 v27, v29, 1 , 1
5734
+ ; GFX6-NEXT: v_bfe_u32 v23, v29, 3 , 1
5735
+ ; GFX6-NEXT: v_bfe_u32 v19, v29, 5 , 1
5736
+ ; GFX6-NEXT: v_bfe_u32 v15, v29, 7 , 1
5737
+ ; GFX6-NEXT: v_bfe_u32 v11, v29, 9 , 1
5738
+ ; GFX6-NEXT: v_and_b32_e32 v25, 1, v29
5739
+ ; GFX6-NEXT: v_bfe_u32 v21, v29, 2 , 1
5740
+ ; GFX6-NEXT: v_bfe_u32 v17, v29, 4, 1
5741
+ ; GFX6-NEXT: v_bfe_u32 v13, v29, 6 , 1
5742
5742
; GFX6-NEXT: s_waitcnt expcnt(0)
5743
- ; GFX6-NEXT: v_bfe_u32 v9, v29, 12 , 1
5744
- ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
5745
- ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16
5746
- ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0
5747
- ; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
5748
- ; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32
5743
+ ; GFX6-NEXT: v_bfe_u32 v9, v29, 8 , 1
5744
+ ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
5745
+ ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
5746
+ ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32
5747
+ ; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16
5748
+ ; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0
5749
5749
; GFX6-NEXT: s_endpgm
5750
5750
;
5751
5751
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -5761,7 +5761,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5761
5761
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
5762
5762
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
5763
5763
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5764
- ; GFX8-NEXT: s_add_u32 s4, s0, 0x50
5764
+ ; GFX8-NEXT: s_add_u32 s4, s0, 0x60
5765
5765
; GFX8-NEXT: s_addc_u32 s5, s1, 0
5766
5766
; GFX8-NEXT: v_mov_b32_e32 v23, s5
5767
5767
; GFX8-NEXT: v_mov_b32_e32 v22, s4
@@ -5775,9 +5775,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5775
5775
; GFX8-NEXT: v_mov_b32_e32 v21, v2
5776
5776
; GFX8-NEXT: v_mov_b32_e32 v25, v2
5777
5777
; GFX8-NEXT: s_waitcnt vmcnt(0)
5778
- ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10 , v0
5778
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 12 , v0
5779
5779
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
5780
- ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11 , v0
5780
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 13 , v0
5781
5781
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5782
5782
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
5783
5783
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
@@ -5786,31 +5786,31 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5786
5786
; GFX8-NEXT: v_mov_b32_e32 v22, s2
5787
5787
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
5788
5788
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5789
- ; GFX8-NEXT: s_add_u32 s2, s0, 64
5789
+ ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
5790
5790
; GFX8-NEXT: v_mov_b32_e32 v5, v2
5791
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0
5791
5792
; GFX8-NEXT: v_mov_b32_e32 v7, v2
5792
- ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
5793
5793
; GFX8-NEXT: v_mov_b32_e32 v23, v2
5794
- ; GFX8-NEXT: v_mov_b32_e32 v3, 1
5794
+ ; GFX8-NEXT: s_add_u32 s2, s0, 0x50
5795
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 10, v0
5796
+ ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5795
5797
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5798
+ ; GFX8-NEXT: v_and_b32_e32 v8, 1, v2
5799
+ ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v1
5796
5800
; GFX8-NEXT: v_mov_b32_e32 v1, s2
5797
5801
; GFX8-NEXT: v_mov_b32_e32 v2, s3
5798
- ; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
5799
- ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
5800
- ; GFX8-NEXT: s_add_u32 s2, s0, 0x60
5801
- ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
5802
+ ; GFX8-NEXT: s_add_u32 s2, s0, 64
5802
5803
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5803
- ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
5804
5804
; GFX8-NEXT: v_mov_b32_e32 v4, s3
5805
5805
; GFX8-NEXT: v_mov_b32_e32 v3, s2
5806
5806
; GFX8-NEXT: s_add_u32 s2, s0, 48
5807
+ ; GFX8-NEXT: v_mov_b32_e32 v6, 1
5807
5808
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5808
- ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
5809
5809
; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
5810
5810
; GFX8-NEXT: v_mov_b32_e32 v2, s1
5811
5811
; GFX8-NEXT: v_mov_b32_e32 v9, s3
5812
- ; GFX8-NEXT: v_and_b32_e32 v11, 1 , v6
5813
- ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13 , v0
5812
+ ; GFX8-NEXT: v_and_b32_sdwa v11, v0 , v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
5813
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 9 , v0
5814
5814
; GFX8-NEXT: v_mov_b32_e32 v8, s2
5815
5815
; GFX8-NEXT: s_add_u32 s2, s0, 32
5816
5816
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
@@ -5940,35 +5940,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5940
5940
; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
5941
5941
; GFX12-NEXT: s_wait_loadcnt 0x0
5942
5942
; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
5943
- ; GFX12-NEXT: v_lshrrev_b16 v4, 11 , v0
5944
- ; GFX12-NEXT: v_lshrrev_b16 v8, 9 , v0
5945
- ; GFX12-NEXT: v_lshrrev_b16 v12, 13 , v0
5943
+ ; GFX12-NEXT: v_lshrrev_b16 v4, 13 , v0
5944
+ ; GFX12-NEXT: v_lshrrev_b16 v8, 11 , v0
5945
+ ; GFX12-NEXT: v_lshrrev_b16 v12, 9 , v0
5946
5946
; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0
5947
5947
; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0
5948
5948
; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0
5949
- ; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0
5950
5949
; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0
5951
5950
; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0
5952
5951
; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0
5952
+ ; GFX12-NEXT: v_lshrrev_b16 v10, 12, v0
5953
5953
; GFX12-NEXT: v_and_b32_e32 v33, 1, v4
5954
5954
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8
5955
- ; GFX12-NEXT: v_lshrrev_b16 v14, 8 , v0
5956
- ; GFX12-NEXT: v_lshrrev_b16 v18, 12 , v0
5955
+ ; GFX12-NEXT: v_lshrrev_b16 v14, 10 , v0
5956
+ ; GFX12-NEXT: v_lshrrev_b16 v18, 8 , v0
5957
5957
; GFX12-NEXT: v_and_b32_e32 v35, 1, v12
5958
5958
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16
5959
5959
; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0
5960
5960
; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24
5961
5961
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32
5962
- ; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10
5963
5962
; GFX12-NEXT: v_mov_b32_e32 v23, v1
5964
5963
; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2
5965
- ; GFX12-NEXT: v_mov_b32_e32 v31, v1
5966
5964
; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0
5967
5965
; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0
5968
5966
; GFX12-NEXT: v_and_b32_e32 v37, 1, v20
5969
- ; GFX12-NEXT: v_and_b32_e32 v0, 1, v6
5967
+ ; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v0, 1, v6
5968
+ ; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
5970
5969
; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33
5971
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
5972
5970
; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30
5973
5971
; GFX12-NEXT: v_and_b32_e32 v8, 1, v14
5974
5972
; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34
@@ -5978,13 +5976,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5978
5976
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35
5979
5977
; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36
5980
5978
; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32
5981
- ; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
5979
+ ; GFX12-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_and_b32 v26, 0xffff, v38
5982
5980
; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
5983
5981
; GFX12-NEXT: s_clause 0x7
5984
- ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
5985
- ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
5986
5982
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
5987
- ; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
5983
+ ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:96
5984
+ ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:80
5985
+ ; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:64
5988
5986
; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
5989
5987
; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
5990
5988
; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
0 commit comments