@@ -5715,33 +5715,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5715
5715
; GFX6-NEXT: s_mov_b32 s0, s4
5716
5716
; GFX6-NEXT: s_mov_b32 s1, s5
5717
5717
; GFX6-NEXT: s_waitcnt vmcnt(0)
5718
- ; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1
5719
- ; GFX6-NEXT: v_bfe_u32 v0, v29, 10 , 1
5720
- ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
5721
- ; GFX6-NEXT: v_bfe_u32 v5, v29, 9 , 1
5718
+ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v29
5719
+ ; GFX6-NEXT: v_bfe_u32 v0, v29, 14 , 1
5720
+ ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
5721
+ ; GFX6-NEXT: v_bfe_u32 v5, v29, 13 , 1
5722
5722
; GFX6-NEXT: s_waitcnt expcnt(0)
5723
- ; GFX6-NEXT: v_bfe_u32 v3, v29, 8 , 1
5724
- ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64
5725
- ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29
5723
+ ; GFX6-NEXT: v_bfe_u32 v3, v29, 12 , 1
5724
+ ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
5725
+ ; GFX6-NEXT: v_bfe_u32 v8, v29, 11, 1
5726
5726
; GFX6-NEXT: s_waitcnt expcnt(0)
5727
- ; GFX6-NEXT: v_bfe_u32 v6, v29, 14 , 1
5728
- ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
5729
- ; GFX6-NEXT: v_bfe_u32 v27, v29, 5 , 1
5730
- ; GFX6-NEXT: v_bfe_u32 v23, v29, 7 , 1
5731
- ; GFX6-NEXT: v_bfe_u32 v19, v29, 1 , 1
5732
- ; GFX6-NEXT: v_bfe_u32 v15, v29, 3 , 1
5733
- ; GFX6-NEXT: v_bfe_u32 v11, v29, 13 , 1
5734
- ; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1
5735
- ; GFX6-NEXT: v_bfe_u32 v21, v29, 6 , 1
5736
- ; GFX6-NEXT: v_and_b32_e32 v17, 1, v29
5737
- ; GFX6-NEXT: v_bfe_u32 v13, v29, 2 , 1
5727
+ ; GFX6-NEXT: v_bfe_u32 v6, v29, 10 , 1
5728
+ ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
5729
+ ; GFX6-NEXT: v_bfe_u32 v27, v29, 1 , 1
5730
+ ; GFX6-NEXT: v_bfe_u32 v23, v29, 3 , 1
5731
+ ; GFX6-NEXT: v_bfe_u32 v19, v29, 5 , 1
5732
+ ; GFX6-NEXT: v_bfe_u32 v15, v29, 7 , 1
5733
+ ; GFX6-NEXT: v_bfe_u32 v11, v29, 9 , 1
5734
+ ; GFX6-NEXT: v_and_b32_e32 v25, 1, v29
5735
+ ; GFX6-NEXT: v_bfe_u32 v21, v29, 2 , 1
5736
+ ; GFX6-NEXT: v_bfe_u32 v17, v29, 4, 1
5737
+ ; GFX6-NEXT: v_bfe_u32 v13, v29, 6 , 1
5738
5738
; GFX6-NEXT: s_waitcnt expcnt(0)
5739
- ; GFX6-NEXT: v_bfe_u32 v9, v29, 12 , 1
5740
- ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
5741
- ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16
5742
- ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0
5743
- ; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
5744
- ; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32
5739
+ ; GFX6-NEXT: v_bfe_u32 v9, v29, 8 , 1
5740
+ ; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
5741
+ ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
5742
+ ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32
5743
+ ; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16
5744
+ ; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0
5745
5745
; GFX6-NEXT: s_endpgm
5746
5746
;
5747
5747
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -5757,7 +5757,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5757
5757
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
5758
5758
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
5759
5759
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5760
- ; GFX8-NEXT: s_add_u32 s4, s0, 0x50
5760
+ ; GFX8-NEXT: s_add_u32 s4, s0, 0x60
5761
5761
; GFX8-NEXT: s_addc_u32 s5, s1, 0
5762
5762
; GFX8-NEXT: v_mov_b32_e32 v23, s5
5763
5763
; GFX8-NEXT: v_mov_b32_e32 v22, s4
@@ -5771,9 +5771,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5771
5771
; GFX8-NEXT: v_mov_b32_e32 v21, v2
5772
5772
; GFX8-NEXT: v_mov_b32_e32 v25, v2
5773
5773
; GFX8-NEXT: s_waitcnt vmcnt(0)
5774
- ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10 , v0
5774
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 12 , v0
5775
5775
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
5776
- ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11 , v0
5776
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 13 , v0
5777
5777
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5778
5778
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
5779
5779
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
@@ -5782,31 +5782,31 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5782
5782
; GFX8-NEXT: v_mov_b32_e32 v22, s2
5783
5783
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
5784
5784
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5785
- ; GFX8-NEXT: s_add_u32 s2, s0, 64
5785
+ ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
5786
5786
; GFX8-NEXT: v_mov_b32_e32 v5, v2
5787
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0
5787
5788
; GFX8-NEXT: v_mov_b32_e32 v7, v2
5788
- ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
5789
5789
; GFX8-NEXT: v_mov_b32_e32 v23, v2
5790
- ; GFX8-NEXT: v_mov_b32_e32 v3, 1
5790
+ ; GFX8-NEXT: s_add_u32 s2, s0, 0x50
5791
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 10, v0
5792
+ ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
5791
5793
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5794
+ ; GFX8-NEXT: v_and_b32_e32 v8, 1, v2
5795
+ ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v1
5792
5796
; GFX8-NEXT: v_mov_b32_e32 v1, s2
5793
5797
; GFX8-NEXT: v_mov_b32_e32 v2, s3
5794
- ; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
5795
- ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
5796
- ; GFX8-NEXT: s_add_u32 s2, s0, 0x60
5797
- ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
5798
+ ; GFX8-NEXT: s_add_u32 s2, s0, 64
5798
5799
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5799
- ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
5800
5800
; GFX8-NEXT: v_mov_b32_e32 v4, s3
5801
5801
; GFX8-NEXT: v_mov_b32_e32 v3, s2
5802
5802
; GFX8-NEXT: s_add_u32 s2, s0, 48
5803
+ ; GFX8-NEXT: v_mov_b32_e32 v6, 1
5803
5804
; GFX8-NEXT: s_addc_u32 s3, s1, 0
5804
- ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
5805
5805
; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
5806
5806
; GFX8-NEXT: v_mov_b32_e32 v2, s1
5807
5807
; GFX8-NEXT: v_mov_b32_e32 v9, s3
5808
- ; GFX8-NEXT: v_and_b32_e32 v11, 1 , v6
5809
- ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13 , v0
5808
+ ; GFX8-NEXT: v_and_b32_sdwa v11, v0 , v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
5809
+ ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 9 , v0
5810
5810
; GFX8-NEXT: v_mov_b32_e32 v8, s2
5811
5811
; GFX8-NEXT: s_add_u32 s2, s0, 32
5812
5812
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
@@ -5936,35 +5936,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5936
5936
; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
5937
5937
; GFX12-NEXT: s_wait_loadcnt 0x0
5938
5938
; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
5939
- ; GFX12-NEXT: v_lshrrev_b16 v4, 11 , v0
5940
- ; GFX12-NEXT: v_lshrrev_b16 v8, 9 , v0
5941
- ; GFX12-NEXT: v_lshrrev_b16 v12, 13 , v0
5939
+ ; GFX12-NEXT: v_lshrrev_b16 v4, 13 , v0
5940
+ ; GFX12-NEXT: v_lshrrev_b16 v8, 11 , v0
5941
+ ; GFX12-NEXT: v_lshrrev_b16 v12, 9 , v0
5942
5942
; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0
5943
5943
; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0
5944
5944
; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0
5945
- ; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0
5946
5945
; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0
5947
5946
; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0
5948
5947
; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0
5948
+ ; GFX12-NEXT: v_lshrrev_b16 v10, 12, v0
5949
5949
; GFX12-NEXT: v_and_b32_e32 v33, 1, v4
5950
5950
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8
5951
- ; GFX12-NEXT: v_lshrrev_b16 v14, 8 , v0
5952
- ; GFX12-NEXT: v_lshrrev_b16 v18, 12 , v0
5951
+ ; GFX12-NEXT: v_lshrrev_b16 v14, 10 , v0
5952
+ ; GFX12-NEXT: v_lshrrev_b16 v18, 8 , v0
5953
5953
; GFX12-NEXT: v_and_b32_e32 v35, 1, v12
5954
5954
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16
5955
5955
; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0
5956
5956
; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24
5957
5957
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32
5958
- ; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10
5959
5958
; GFX12-NEXT: v_mov_b32_e32 v23, v1
5960
5959
; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2
5961
- ; GFX12-NEXT: v_mov_b32_e32 v31, v1
5962
5960
; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0
5963
5961
; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0
5964
5962
; GFX12-NEXT: v_and_b32_e32 v37, 1, v20
5965
- ; GFX12-NEXT: v_and_b32_e32 v0, 1, v6
5963
+ ; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v0, 1, v6
5964
+ ; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
5966
5965
; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33
5967
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
5968
5966
; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30
5969
5967
; GFX12-NEXT: v_and_b32_e32 v8, 1, v14
5970
5968
; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34
@@ -5974,13 +5972,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
5974
5972
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35
5975
5973
; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36
5976
5974
; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32
5977
- ; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
5975
+ ; GFX12-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_and_b32 v26, 0xffff, v38
5978
5976
; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
5979
5977
; GFX12-NEXT: s_clause 0x7
5980
- ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
5981
- ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
5982
5978
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
5983
- ; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
5979
+ ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:96
5980
+ ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:80
5981
+ ; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:64
5984
5982
; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
5985
5983
; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
5986
5984
; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
0 commit comments