@@ -521,11 +521,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
521
521
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
522
522
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3
523
523
; GFX908-NEXT: s_sub_i32 s8, 0, s3
524
- ; GFX908-NEXT: v_cvt_f32_f16_e32 v17 , s7
525
- ; GFX908-NEXT: v_mov_b32_e32 v19 , 0
524
+ ; GFX908-NEXT: v_cvt_f32_f16_e32 v18 , s7
525
+ ; GFX908-NEXT: v_mov_b32_e32 v17 , 0
526
526
; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
527
527
; GFX908-NEXT: v_mov_b32_e32 v0, 0
528
528
; GFX908-NEXT: v_mov_b32_e32 v1, 0
529
+ ; GFX908-NEXT: v_mov_b32_e32 v20, -1
529
530
; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
530
531
; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2
531
532
; GFX908-NEXT: v_readfirstlane_b32 s10, v2
@@ -544,7 +545,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
544
545
; GFX908-NEXT: s_cmp_ge_u32 s2, s3
545
546
; GFX908-NEXT: s_cselect_b32 s8, s10, s8
546
547
; GFX908-NEXT: s_lshr_b32 s7, s7, 16
547
- ; GFX908-NEXT: v_cvt_f32_f16_e32 v18 , s7
548
+ ; GFX908-NEXT: v_cvt_f32_f16_e32 v19 , s7
548
549
; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5
549
550
; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5
550
551
; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5
@@ -611,37 +612,37 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
611
612
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
612
613
; GFX908-NEXT: s_add_u32 s20, s18, s7
613
614
; GFX908-NEXT: s_addc_u32 s21, s19, s9
614
- ; GFX908-NEXT: global_load_dword v21, v19 , s[20:21] offset:-12 glc
615
+ ; GFX908-NEXT: global_load_dword v22, v17 , s[20:21] offset:-12 glc
615
616
; GFX908-NEXT: s_waitcnt vmcnt(0)
616
- ; GFX908-NEXT: global_load_dword v20, v19 , s[20:21] offset:-8 glc
617
+ ; GFX908-NEXT: global_load_dword v21, v17 , s[20:21] offset:-8 glc
617
618
; GFX908-NEXT: s_waitcnt vmcnt(0)
618
- ; GFX908-NEXT: global_load_dword v12, v19 , s[20:21] offset:-4 glc
619
+ ; GFX908-NEXT: global_load_dword v12, v17 , s[20:21] offset:-4 glc
619
620
; GFX908-NEXT: s_waitcnt vmcnt(0)
620
- ; GFX908-NEXT: global_load_dword v12, v19 , s[20:21] glc
621
+ ; GFX908-NEXT: global_load_dword v12, v17 , s[20:21] glc
621
622
; GFX908-NEXT: s_waitcnt vmcnt(0)
622
- ; GFX908-NEXT: ds_read_b64 v[12:13], v19
623
+ ; GFX908-NEXT: ds_read_b64 v[12:13], v20
623
624
; GFX908-NEXT: ds_read_b64 v[14:15], v0
624
625
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
625
626
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
626
627
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
627
628
; GFX908-NEXT: ; %bb.6: ; %bb51
628
629
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
629
- ; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
630
+ ; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
631
+ ; GFX908-NEXT: v_cvt_f32_f16_e32 v22, v22
632
+ ; GFX908-NEXT: v_cvt_f32_f16_sdwa v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
630
633
; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21
631
- ; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
632
- ; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20
633
- ; GFX908-NEXT: v_add_f32_e32 v24, v17, v12
634
- ; GFX908-NEXT: v_add_f32_e32 v25, v18, v13
635
- ; GFX908-NEXT: v_add_f32_e32 v26, 0, v12
636
- ; GFX908-NEXT: v_add_f32_e32 v27, 0, v13
637
- ; GFX908-NEXT: v_add_f32_e32 v15, v22, v15
638
- ; GFX908-NEXT: v_add_f32_e32 v14, v21, v14
639
- ; GFX908-NEXT: v_add_f32_e32 v13, v23, v13
640
- ; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
641
- ; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
642
- ; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
643
- ; GFX908-NEXT: v_add_f32_e32 v7, v7, v27
644
- ; GFX908-NEXT: v_add_f32_e32 v6, v6, v26
634
+ ; GFX908-NEXT: v_add_f32_e32 v25, v18, v12
635
+ ; GFX908-NEXT: v_add_f32_e32 v26, v19, v13
636
+ ; GFX908-NEXT: v_add_f32_e32 v27, 0, v12
637
+ ; GFX908-NEXT: v_add_f32_e32 v28, 0, v13
638
+ ; GFX908-NEXT: v_add_f32_e32 v15, v23, v15
639
+ ; GFX908-NEXT: v_add_f32_e32 v14, v22, v14
640
+ ; GFX908-NEXT: v_add_f32_e32 v13, v24, v13
641
+ ; GFX908-NEXT: v_add_f32_e32 v12, v21, v12
642
+ ; GFX908-NEXT: v_add_f32_e32 v5, v5, v26
643
+ ; GFX908-NEXT: v_add_f32_e32 v4, v4, v25
644
+ ; GFX908-NEXT: v_add_f32_e32 v7, v7, v28
645
+ ; GFX908-NEXT: v_add_f32_e32 v6, v6, v27
645
646
; GFX908-NEXT: v_add_f32_e32 v8, v8, v14
646
647
; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
647
648
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
@@ -686,6 +687,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
686
687
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3
687
688
; GFX90A-NEXT: s_sub_i32 s8, 0, s3
688
689
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
690
+ ; GFX90A-NEXT: v_mov_b32_e32 v20, -1
689
691
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
690
692
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
691
693
; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
@@ -770,32 +772,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
770
772
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
771
773
; GFX90A-NEXT: s_add_u32 s20, s18, s7
772
774
; GFX90A-NEXT: s_addc_u32 s21, s19, s9
773
- ; GFX90A-NEXT: global_load_dword v21 , v19, s[20:21] offset:-12 glc
775
+ ; GFX90A-NEXT: global_load_dword v22 , v19, s[20:21] offset:-12 glc
774
776
; GFX90A-NEXT: s_waitcnt vmcnt(0)
775
- ; GFX90A-NEXT: global_load_dword v20 , v19, s[20:21] offset:-8 glc
777
+ ; GFX90A-NEXT: global_load_dword v21 , v19, s[20:21] offset:-8 glc
776
778
; GFX90A-NEXT: s_waitcnt vmcnt(0)
777
779
; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc
778
780
; GFX90A-NEXT: s_waitcnt vmcnt(0)
779
781
; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc
780
782
; GFX90A-NEXT: s_waitcnt vmcnt(0)
781
- ; GFX90A-NEXT: ds_read_b64 v[14:15], v19
783
+ ; GFX90A-NEXT: ds_read_b64 v[14:15], v20
782
784
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
783
785
; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1]
784
786
; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
785
787
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
786
788
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
787
789
; GFX90A-NEXT: ; %bb.6: ; %bb51
788
790
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
789
- ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
790
- ; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21
791
- ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
792
- ; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20
793
- ; GFX90A-NEXT: v_pk_add_f32 v[24:25 ], v[2:3], v[14:15]
794
- ; GFX90A-NEXT: v_pk_add_f32 v[26:27 ], v[14:15], 0 op_sel_hi:[1,0]
791
+ ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
792
+ ; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v22
793
+ ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
794
+ ; GFX90A-NEXT: v_cvt_f32_f16_e32 v24, v21
795
+ ; GFX90A-NEXT: v_pk_add_f32 v[26:27 ], v[2:3], v[14:15]
796
+ ; GFX90A-NEXT: v_pk_add_f32 v[28:29 ], v[14:15], 0 op_sel_hi:[1,0]
795
797
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
796
- ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21 ], v[14:15]
797
- ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25 ]
798
- ; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27 ]
798
+ ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[24:25 ], v[14:15]
799
+ ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[26:27 ]
800
+ ; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[28:29 ]
799
801
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
800
802
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
801
803
; GFX90A-NEXT: s_mov_b64 s[20:21], -1
0 commit comments