Skip to content

Commit 7a10179

Browse files
committed
Revert "[AMDGPU] Mark mbcnt as convergent"
This reverts commit 3711403. The output of mbcnt does not depend on other active lanes, and hence it is not convergent. The original change was made as a possible fix for ROCm/hip#3172, but changing mbcnt does not fix that issue. Reviewed By: ruiling, foad, yaxunl. Differential Revision: https://reviews.llvm.org/D153953
1 parent ea045b9 commit 7a10179

File tree

7 files changed

+416
-381
lines changed

7 files changed

+416
-381
lines changed

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -690,14 +690,12 @@ kernel void test_gws_sema_p(uint id) {
690690

691691
// CHECK-LABEL: @test_mbcnt_lo(
692692
// CHECK: call i32 @llvm.amdgcn.mbcnt.lo(i32 %src0, i32 %src1)
693-
// CHECK: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #[[$MBCNT_ATTRS:[0-9]+]]
694693
kernel void test_mbcnt_lo(global uint* out, uint src0, uint src1) {
695694
*out = __builtin_amdgcn_mbcnt_lo(src0, src1);
696695
}
697696

698697
// CHECK-LABEL: @test_mbcnt_hi(
699698
// CHECK: call i32 @llvm.amdgcn.mbcnt.hi(i32 %src0, i32 %src1)
700-
// CHECK: declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #[[$MBCNT_ATTRS]]
701699
kernel void test_mbcnt_hi(global uint* out, uint src0, uint src1) {
702700
*out = __builtin_amdgcn_mbcnt_hi(src0, src1);
703701
}
@@ -834,7 +832,6 @@ void test_atomic_inc_dec(local uint *lptr, global uint *gptr, uint val) {
834832
// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
835833
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
836834
// CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }
837-
// CHECK-DAG: attributes #[[$MBCNT_ATTRS]] = {{.* convergent .*}}
838835
// CHECK-DAG: ![[$EXEC]] = !{!"exec"}
839836
// CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"}
840837
// CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1833,12 +1833,12 @@ def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
18331833
def int_amdgcn_mbcnt_lo :
18341834
ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
18351835
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
1836-
[IntrNoMem, IntrConvergent]>;
1836+
[IntrNoMem]>;
18371837

18381838
def int_amdgcn_mbcnt_hi :
18391839
ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
18401840
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
1841-
[IntrNoMem, IntrConvergent]>;
1841+
[IntrNoMem]>;
18421842

18431843
// llvm.amdgcn.ds.swizzle src offset
18441844
def int_amdgcn_ds_swizzle :

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 63 additions & 57 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 42 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -516,10 +516,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
516516
;
517517
; GFX8-LABEL: add_i32_varying:
518518
; GFX8: ; %bb.0: ; %entry
519-
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
520519
; GFX8-NEXT: s_mov_b64 s[2:3], exec
521520
; GFX8-NEXT: s_mov_b32 s6, 0
522-
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
523521
; GFX8-NEXT: ; implicit-def: $vgpr1
524522
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
525523
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -537,7 +535,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
537535
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
538536
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
539537
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
540-
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
538+
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
539+
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
540+
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
541541
; GFX8-NEXT: ; implicit-def: $vgpr0
542542
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
543543
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -565,10 +565,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
565565
;
566566
; GFX9-LABEL: add_i32_varying:
567567
; GFX9: ; %bb.0: ; %entry
568-
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
569568
; GFX9-NEXT: s_mov_b64 s[2:3], exec
570569
; GFX9-NEXT: s_mov_b32 s6, 0
571-
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
572570
; GFX9-NEXT: ; implicit-def: $vgpr1
573571
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
574572
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -586,7 +584,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
586584
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
587585
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
588586
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
589-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
587+
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
588+
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
589+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
590590
; GFX9-NEXT: ; implicit-def: $vgpr0
591591
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
592592
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -614,10 +614,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
614614
;
615615
; GFX1064-LABEL: add_i32_varying:
616616
; GFX1064: ; %bb.0: ; %entry
617-
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
618617
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
619618
; GFX1064-NEXT: s_mov_b32 s6, 0
620-
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
621619
; GFX1064-NEXT: ; implicit-def: $vgpr1
622620
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
623621
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -634,7 +632,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
634632
; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
635633
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
636634
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
637-
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
635+
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
636+
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
637+
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
638638
; GFX1064-NEXT: ; implicit-def: $vgpr0
639639
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
640640
; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -665,7 +665,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
665665
;
666666
; GFX1032-LABEL: add_i32_varying:
667667
; GFX1032: ; %bb.0: ; %entry
668-
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
669668
; GFX1032-NEXT: s_mov_b32 s2, exec_lo
670669
; GFX1032-NEXT: s_mov_b32 s4, 0
671670
; GFX1032-NEXT: ; implicit-def: $vgpr1
@@ -681,7 +680,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
681680
; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
682681
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
683682
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
684-
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
683+
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
684+
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
685685
; GFX1032-NEXT: ; implicit-def: $vgpr0
686686
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
687687
; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
@@ -712,11 +712,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
712712
;
713713
; GFX1164-LABEL: add_i32_varying:
714714
; GFX1164: ; %bb.0: ; %entry
715-
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
716715
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
717716
; GFX1164-NEXT: s_mov_b32 s6, 0
718-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
719-
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
720717
; GFX1164-NEXT: ; implicit-def: $vgpr1
721718
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
722719
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -735,9 +732,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
735732
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
736733
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
737734
; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
738-
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
735+
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
736+
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
737+
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
738+
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
739739
; GFX1164-NEXT: ; implicit-def: $vgpr0
740-
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
740+
; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
741+
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
741742
; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
742743
; GFX1164-NEXT: s_cbranch_execz .LBB2_4
743744
; GFX1164-NEXT: ; %bb.3:
@@ -767,7 +768,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
767768
;
768769
; GFX1132-LABEL: add_i32_varying:
769770
; GFX1132: ; %bb.0: ; %entry
770-
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
771771
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
772772
; GFX1132-NEXT: s_mov_b32 s4, 0
773773
; GFX1132-NEXT: ; implicit-def: $vgpr1
@@ -784,9 +784,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
784784
; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
785785
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
786786
; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
787-
; GFX1132-NEXT: s_mov_b32 s5, exec_lo
787+
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
788+
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
789+
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
788790
; GFX1132-NEXT: ; implicit-def: $vgpr0
789-
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
791+
; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
790792
; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
791793
; GFX1132-NEXT: s_cbranch_execz .LBB2_4
792794
; GFX1132-NEXT: ; %bb.3:
@@ -2016,10 +2018,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
20162018
;
20172019
; GFX8-LABEL: sub_i32_varying:
20182020
; GFX8: ; %bb.0: ; %entry
2019-
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
20202021
; GFX8-NEXT: s_mov_b64 s[2:3], exec
20212022
; GFX8-NEXT: s_mov_b32 s6, 0
2022-
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
20232023
; GFX8-NEXT: ; implicit-def: $vgpr1
20242024
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
20252025
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2037,7 +2037,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
20372037
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
20382038
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
20392039
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2040-
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
2040+
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2041+
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2042+
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
20412043
; GFX8-NEXT: ; implicit-def: $vgpr0
20422044
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
20432045
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2065,10 +2067,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
20652067
;
20662068
; GFX9-LABEL: sub_i32_varying:
20672069
; GFX9: ; %bb.0: ; %entry
2068-
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
20692070
; GFX9-NEXT: s_mov_b64 s[2:3], exec
20702071
; GFX9-NEXT: s_mov_b32 s6, 0
2071-
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
20722072
; GFX9-NEXT: ; implicit-def: $vgpr1
20732073
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
20742074
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2086,7 +2086,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
20862086
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
20872087
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
20882088
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2089-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
2089+
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2090+
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2091+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
20902092
; GFX9-NEXT: ; implicit-def: $vgpr0
20912093
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
20922094
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2114,10 +2116,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
21142116
;
21152117
; GFX1064-LABEL: sub_i32_varying:
21162118
; GFX1064: ; %bb.0: ; %entry
2117-
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
21182119
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
21192120
; GFX1064-NEXT: s_mov_b32 s6, 0
2120-
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
21212121
; GFX1064-NEXT: ; implicit-def: $vgpr1
21222122
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
21232123
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2134,7 +2134,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
21342134
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
21352135
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
21362136
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2137-
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
2137+
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2138+
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2139+
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
21382140
; GFX1064-NEXT: ; implicit-def: $vgpr0
21392141
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
21402142
; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2165,7 +2167,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
21652167
;
21662168
; GFX1032-LABEL: sub_i32_varying:
21672169
; GFX1032: ; %bb.0: ; %entry
2168-
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
21692170
; GFX1032-NEXT: s_mov_b32 s2, exec_lo
21702171
; GFX1032-NEXT: s_mov_b32 s4, 0
21712172
; GFX1032-NEXT: ; implicit-def: $vgpr1
@@ -2181,7 +2182,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
21812182
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
21822183
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
21832184
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2184-
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
2185+
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2186+
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
21852187
; GFX1032-NEXT: ; implicit-def: $vgpr0
21862188
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
21872189
; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
@@ -2212,11 +2214,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
22122214
;
22132215
; GFX1164-LABEL: sub_i32_varying:
22142216
; GFX1164: ; %bb.0: ; %entry
2215-
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
22162217
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
22172218
; GFX1164-NEXT: s_mov_b32 s6, 0
2218-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
2219-
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1
22202219
; GFX1164-NEXT: ; implicit-def: $vgpr1
22212220
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
22222221
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2235,9 +2234,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
22352234
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
22362235
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
22372236
; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2238-
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
2237+
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2238+
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2239+
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2240+
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
22392241
; GFX1164-NEXT: ; implicit-def: $vgpr0
2240-
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
2242+
; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
2243+
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
22412244
; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22422245
; GFX1164-NEXT: s_cbranch_execz .LBB8_4
22432246
; GFX1164-NEXT: ; %bb.3:
@@ -2267,7 +2270,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
22672270
;
22682271
; GFX1132-LABEL: sub_i32_varying:
22692272
; GFX1132: ; %bb.0: ; %entry
2270-
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
22712273
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
22722274
; GFX1132-NEXT: s_mov_b32 s4, 0
22732275
; GFX1132-NEXT: ; implicit-def: $vgpr1
@@ -2284,9 +2286,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
22842286
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
22852287
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
22862288
; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2287-
; GFX1132-NEXT: s_mov_b32 s5, exec_lo
2289+
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2290+
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2291+
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
22882292
; GFX1132-NEXT: ; implicit-def: $vgpr0
2289-
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
2293+
; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
22902294
; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
22912295
; GFX1132-NEXT: s_cbranch_execz .LBB8_4
22922296
; GFX1132-NEXT: ; %bb.3:

0 commit comments

Comments
 (0)