@@ -571,13 +571,44 @@ entry:
571
571
define amdgpu_kernel void @add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
572
572
; GFX6-LABEL: add_i32_varying_vdata:
573
573
; GFX6: ; %bb.0: ; %entry
574
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
575
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
574
+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
575
+ ; GFX6-NEXT: s_mov_b32 s4, 0
576
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
577
+ ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
578
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
579
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
580
+ ; GFX6-NEXT: s_mov_b32 m0, s5
581
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
582
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
583
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
584
+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
585
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
586
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
587
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
588
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
589
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
590
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
591
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
592
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
593
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
594
+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
595
+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
596
+ ; GFX6-NEXT: s_cbranch_execz .LBB2_4
597
+ ; GFX6-NEXT: ; %bb.3:
598
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
599
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
576
600
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
577
- ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
601
+ ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
602
+ ; GFX6-NEXT: .LBB2_4:
603
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
604
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
578
605
; GFX6-NEXT: s_mov_b32 s3, 0xf000
579
606
; GFX6-NEXT: s_mov_b32 s2, -1
580
607
; GFX6-NEXT: s_waitcnt vmcnt(0)
608
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
609
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
610
+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
611
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
581
612
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
582
613
; GFX6-NEXT: s_endpgm
583
614
;
@@ -924,15 +955,46 @@ entry:
924
955
define amdgpu_kernel void @struct_add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout , i32 %vindex ) {
925
956
; GFX6-LABEL: struct_add_i32_varying_vdata:
926
957
; GFX6: ; %bb.0: ; %entry
927
- ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11
928
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
929
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
958
+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
959
+ ; GFX6-NEXT: s_mov_b32 s4, 0
960
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
961
+ ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
962
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
963
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
964
+ ; GFX6-NEXT: s_mov_b32 m0, s5
965
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
966
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
967
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
968
+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
969
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
970
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
971
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
972
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
973
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
974
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
975
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
976
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
977
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
978
+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
979
+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
980
+ ; GFX6-NEXT: s_cbranch_execz .LBB3_4
981
+ ; GFX6-NEXT: ; %bb.3:
982
+ ; GFX6-NEXT: s_load_dword s5, s[0:1], 0x11
983
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
984
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
930
985
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
931
- ; GFX6-NEXT: v_mov_b32_e32 v1, s2
932
- ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
986
+ ; GFX6-NEXT: v_mov_b32_e32 v2, s5
987
+ ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
988
+ ; GFX6-NEXT: .LBB3_4:
989
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
990
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
933
991
; GFX6-NEXT: s_mov_b32 s3, 0xf000
934
992
; GFX6-NEXT: s_mov_b32 s2, -1
935
993
; GFX6-NEXT: s_waitcnt vmcnt(0)
994
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
995
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
996
+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
997
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
936
998
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
937
999
; GFX6-NEXT: s_endpgm
938
1000
;
@@ -1953,13 +2015,44 @@ entry:
1953
2015
define amdgpu_kernel void @sub_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
1954
2016
; GFX6-LABEL: sub_i32_varying_vdata:
1955
2017
; GFX6: ; %bb.0: ; %entry
1956
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1957
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2018
+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
2019
+ ; GFX6-NEXT: s_mov_b32 s4, 0
2020
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
2021
+ ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
2022
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2023
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
2024
+ ; GFX6-NEXT: s_mov_b32 m0, s5
2025
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
2026
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
2027
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
2028
+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
2029
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
2030
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
2031
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
2032
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
2033
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
2034
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2035
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2036
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2037
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
2038
+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
2039
+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
2040
+ ; GFX6-NEXT: s_cbranch_execz .LBB7_4
2041
+ ; GFX6-NEXT: ; %bb.3:
2042
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
2043
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
1958
2044
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1959
- ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
2045
+ ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
2046
+ ; GFX6-NEXT: .LBB7_4:
2047
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
2048
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1960
2049
; GFX6-NEXT: s_mov_b32 s3, 0xf000
1961
2050
; GFX6-NEXT: s_mov_b32 s2, -1
1962
2051
; GFX6-NEXT: s_waitcnt vmcnt(0)
2052
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
2053
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
2054
+ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
2055
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1963
2056
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
1964
2057
; GFX6-NEXT: s_endpgm
1965
2058
;
0 commit comments