ROCm
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
Lines changed: 57 additions & 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
Lines changed: 141 additions & 109 deletions
@@ -1,3 +1,10 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; NOTE: The checks for opt are NOT added by the update script. Those
+;       checks are looking for the absence of specific metadata, which
+;       cannot be expressed reliably by the generated checks.
+
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ISA
 ; RUN: opt --amdgpu-annotate-uniform -S %s |  FileCheck %s -check-prefix=UNIFORM
 ; RUN: opt --amdgpu-annotate-uniform --si-annotate-control-flow -S %s |  FileCheck %s -check-prefix=CONTROLFLOW
 
@@ -9,6 +16,56 @@
 target triple = "amdgcn-mesa-mesa3d"
 
 define amdgpu_ps void @main(i32 %0, float %1) {
+; ISA-LABEL: main:
+; ISA:       ; %bb.0: ; %start
+; ISA-NEXT:    v_readfirstlane_b32 s0, v0
+; ISA-NEXT:    s_mov_b32 m0, s0
+; ISA-NEXT:    s_mov_b32 s0, 0
+; ISA-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
+; ISA-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
+; ISA-NEXT:    s_mov_b64 s[2:3], 0
+; ISA-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; ISA-NEXT:    s_branch BB0_3
+; ISA-NEXT:  BB0_1: ; %Flow1
+; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT:    s_add_i32 s0, s0, 1
+; ISA-NEXT:    s_mov_b64 s[8:9], 0
+; ISA-NEXT:  BB0_2: ; %Flow
+; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT:    s_or_b64 s[2:3], s[10:11], s[2:3]
+; ISA-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; ISA-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; ISA-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; ISA-NEXT:    s_cbranch_execz BB0_6
+; ISA-NEXT:  BB0_3: ; %loop
+; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ISA-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; ISA-NEXT:    s_cmp_lt_u32 s0, 32
+; ISA-NEXT:    s_mov_b64 s[8:9], -1
+; ISA-NEXT:    s_cbranch_scc0 BB0_2
+; ISA-NEXT:  ; %bb.4: ; %endif1
+; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_mov_b64 s[6:7], -1
+; ISA-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; ISA-NEXT:    s_cbranch_execz BB0_1
+; ISA-NEXT:  ; %bb.5: ; %endif2
+; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT:    s_branch BB0_1
+; ISA-NEXT:  BB0_6: ; %Flow2
+; ISA-NEXT:    s_or_b64 exec, exec, s[2:3]
+; ISA-NEXT:    v_mov_b32_e32 v1, 0
+; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
+; ISA-NEXT:  ; %bb.7: ; %if1
+; ISA-NEXT:    v_sqrt_f32_e32 v1, v0
+; ISA-NEXT:  ; %bb.8: ; %endloop
+; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
+; ISA-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
+; ISA-NEXT:    s_endpgm
 start:
   %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
   br label %loop
 
@@ -1,13 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify %s | FileCheck -check-prefix=IR %s
 
-; SI-LABEL: {{^}}infinite_loop:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
-; SI: s_waitcnt lgkmcnt(0)
-; SI: buffer_store_dword [[REG]]
-; SI: s_branch [[LOOP]]
 define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
+; SI-LABEL: infinite_loop:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:  BB0_1: ; %loop
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_branch BB0_1
+; IR-LABEL: @infinite_loop(
+; IR-NEXT:  entry:
+; IR-NEXT:    br label [[LOOP:%.*]]
+; IR:       loop:
+; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
@@ -16,31 +30,36 @@ loop:
   br label %loop
 }
 
-
-; IR-LABEL: @infinite_loop_ret(
-; IR:  br i1 %cond, label %loop, label %UnifiedReturnBlock
-
-; IR: loop:
-; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
-; IR: br i1 true, label %loop, label %UnifiedReturnBlock
-
-; IR: UnifiedReturnBlock:
-; IR:  ret void
-
-
-; SI-LABEL: {{^}}infinite_loop_ret:
-; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]
-
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: s_and_b64 vcc, exec, -1
-; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
-; SI: s_waitcnt lgkmcnt(0)
-; SI: buffer_store_dword [[REG]]
-; SI: s_cbranch_vccnz [[LOOP]]
-
-; SI: [[RET]]:  ; %UnifiedReturnBlock
-; SI: s_endpgm
 define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
+; SI-LABEL: infinite_loop_ret:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_cbranch_execz BB1_3
+; SI-NEXT:  ; %bb.1: ; %loop.preheader
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_and_b64 vcc, exec, -1
+; SI-NEXT:  BB1_2: ; %loop
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_cbranch_vccnz BB1_2
+; SI-NEXT:  BB1_3: ; %UnifiedReturnBlock
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loop_ret(
+; IR-NEXT:  entry:
+; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP]], 1
+; IR-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR:       loop:
+; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]]
+; IR:       UnifiedReturnBlock:
+; IR-NEXT:    ret void
+;
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cond = icmp eq i32 %tmp, 1
@@ -54,44 +73,44 @@ return:
   ret void
 }
 
-
-; IR-LABEL: @infinite_loops(
-; IR: br i1 undef, label %loop1, label %loop2
-
-; IR: loop1:
-; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
-; IR: br i1 true, label %loop1, label %DummyReturnBlock
-
-; IR: loop2:
-; IR: store volatile i32 888, i32 addrspace(1)* %out, align 4
-; IR: br i1 true, label %loop2, label %DummyReturnBlock
-
-; IR: DummyReturnBlock:
-; IR: ret void
-
-
-; SI-LABEL: {{^}}infinite_loops:
-
-; SI: v_mov_b32_e32 [[REG1:v[0-9]+]], 0x3e7
-; SI: s_and_b64 vcc, exec, -1
-
-; SI: [[LOOP1:BB[0-9]+_[0-9]+]]:  ; %loop1
-; SI: s_waitcnt lgkmcnt(0)
-; SI: buffer_store_dword [[REG1]]
-; SI: s_cbranch_vccnz [[LOOP1]]
-; SI: s_branch [[RET:BB[0-9]+_[0-9]+]]
-
-; SI: v_mov_b32_e32 [[REG2:v[0-9]+]], 0x378
-; SI: s_and_b64 vcc, exec, -1
-
-; SI: [[LOOP2:BB[0-9]+_[0-9]+]]:  ; %loop2
-; SI: s_waitcnt lgkmcnt(0)
-; SI: buffer_store_dword [[REG2]]
-; SI: s_cbranch_vccnz [[LOOP2]]
-
-; SI: [[RET]]:  ; %DummyReturnBlock
-; SI: s_endpgm
 define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
+; SI-LABEL: infinite_loops:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_cbranch_scc0 BB2_3
+; SI-NEXT:  ; %bb.1: ; %loop1.preheader
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_and_b64 vcc, exec, -1
+; SI-NEXT:  BB2_2: ; %loop1
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_cbranch_vccnz BB2_2
+; SI-NEXT:    s_branch BB2_5
+; SI-NEXT:  BB2_3:
+; SI-NEXT:    v_mov_b32_e32 v0, 0x378
+; SI-NEXT:    s_and_b64 vcc, exec, -1
+; SI-NEXT:  BB2_4: ; %loop2
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_cbranch_vccnz BB2_4
+; SI-NEXT:  BB2_5: ; %DummyReturnBlock
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loops(
+; IR-NEXT:  entry:
+; IR-NEXT:    br i1 undef, label [[LOOP1:%.*]], label [[LOOP2:%.*]]
+; IR:       loop1:
+; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    br i1 true, label [[LOOP1]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR:       loop2:
+; IR-NEXT:    store volatile i32 888, i32 addrspace(1)* [[OUT]], align 4
+; IR-NEXT:    br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]]
+; IR:       DummyReturnBlock:
+; IR-NEXT:    ret void
+;
 entry:
   br i1 undef, label %loop1, label %loop2
 
@@ -104,55 +123,68 @@ loop2:
   br label %loop2
 }
 
-
-
-; IR-LABEL: @infinite_loop_nest_ret(
-; IR: br i1 %cond1, label %outer_loop, label %UnifiedReturnBlock
-
-; IR: outer_loop:
-; IR: br label %inner_loop
-
-; IR: inner_loop:
-; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
-; IR: %cond3 = icmp eq i32 %tmp, 3
-; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock
-
-; IR: TransitionBlock:
-; IR: br i1 %cond3, label %inner_loop, label %outer_loop
-
-; IR: UnifiedReturnBlock:
-; IR: ret void
-
-; SI-LABEL: {{^}}infinite_loop_nest_ret:
-; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]
-
-; SI: s_mov_b32
-; SI: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]:  ; %outer_loop
-
-; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]:  ; %inner_loop
-; SI: s_waitcnt expcnt(0)
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: s_waitcnt lgkmcnt(0)
-; SI: buffer_store_dword [[REG]]
-
-; SI: s_andn2_b64 exec
-; SI: s_cbranch_execnz [[INNER_LOOP]]
-
-; SI: s_andn2_b64 exec
-; SI: s_cbranch_execnz [[OUTER_LOOP]]
-
-; SI: [[RET]]:  ; %UnifiedReturnBlock
-; SI: s_endpgm
 define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
+; SI-LABEL: infinite_loop_nest_ret:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_cbranch_execz BB3_5
+; SI-NEXT:  ; %bb.1: ; %outer_loop.preheader
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0
+; SI-NEXT:    s_mov_b64 s[2:3], 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:  BB3_2: ; %outer_loop
+; SI-NEXT:    ; =>This Loop Header: Depth=1
+; SI-NEXT:    ; Child Loop BB3_3 Depth 2
+; SI-NEXT:    s_and_b64 s[8:9], exec, vcc
+; SI-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], 0
+; SI-NEXT:  BB3_3: ; %inner_loop
+; SI-NEXT:    ; Parent Loop BB3_2 Depth=1
+; SI-NEXT:    ; => This Inner Loop Header: Depth=2
+; SI-NEXT:    s_and_b64 s[10:11], exec, s[0:1]
+; SI-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_cbranch_execnz BB3_3
+; SI-NEXT:  ; %bb.4: ; %Flow
+; SI-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_cbranch_execnz BB3_2
+; SI-NEXT:  BB3_5: ; %UnifiedReturnBlock
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loop_nest_ret(
+; IR-NEXT:  entry:
+; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[COND1:%.*]] = icmp eq i32 [[TMP]], 1
+; IR-NEXT:    br i1 [[COND1]], label [[OUTER_LOOP:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR:       outer_loop:
+; IR-NEXT:    br label [[INNER_LOOP:%.*]]
+; IR:       inner_loop:
+; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    [[COND3:%.*]] = icmp eq i32 [[TMP]], 3
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK]]
+; IR:       TransitionBlock:
+; IR-NEXT:    br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]]
+; IR:       UnifiedReturnBlock:
+; IR-NEXT:    ret void
+;
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cond1 = icmp eq i32 %tmp, 1
   br i1 %cond1, label %outer_loop, label %return
 
 outer_loop:
- ; %cond2 = icmp eq i32 %tmp, 2
- ; br i1 %cond2, label %outer_loop, label %inner_loop
- br label %inner_loop
+  ; %cond2 = icmp eq i32 %tmp, 2
+  ; br i1 %cond2, label %outer_loop, label %inner_loop
+  br label %inner_loop
 
 inner_loop:                                     ; preds = %LeafBlock, %LeafBlock1
   store volatile i32 999, i32 addrspace(1)* %out, align 4