
[AMDGPU] Emit amdgcn.if.break in the same BB as amdgcn.loop #118081


Merged

Conversation

mariusz-sikora-at-amd
Contributor

Before this change, if.break was placed at the wrong loop level, which resulted in accumulating values only from the last iteration of the inner loop.

@llvmbot
Member

llvmbot commented Nov 29, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Mariusz Sikora (mariusz-sikora-at-amd)

Changes

Before this change, if.break was placed at the wrong loop level, which resulted in accumulating values only from the last iteration of the inner loop.
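
For illustration, here is a minimal, self-contained C++ sketch (not the LLVM sources; the name chooseIfBreakInsertPoint and the boolean parameters are hypothetical) of the three-way insertion-point decision this patch introduces in SIAnnotateControlFlow::handleLoopCondition:

// Models the choice of where the amdgcn.if.break call is emitted for a loop L
// whose exit condition is computed by instruction Inst in block Parent.
enum class InsertPoint {
  CondBlockTerminator, // same BB as the condition, i.e. same BB as amdgcn.loop
  LoopLatchTerminator, // the terminator currently being annotated (Term)
  HeaderFirstNonPHI    // first non-PHI instruction of L's header
};

// sameLoopAsCond models LI->getLoopFor(Parent) == L; condInsideL models
// L->contains(Inst) from the patch.
InsertPoint chooseIfBreakInsertPoint(bool sameLoopAsCond, bool condInsideL) {
  if (sameLoopAsCond)
    // New case: emitting if.break next to the condition keeps it in the same
    // block as the amdgcn.loop call, so SILowerControlFlow does not need an
    // extra AND with EXEC.
    return InsertPoint::CondBlockTerminator;
  if (condInsideL)
    // The condition is defined in an inner loop of L: the break mask must be
    // accumulated at L's level. Before the patch this case used the inner
    // block's terminator, so only the last inner iteration contributed.
    return InsertPoint::LoopLatchTerminator;
  // Condition defined outside L: insert at the top of L's header.
  return InsertPoint::HeaderFirstNonPHI;
}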


Full diff: https://github.com/llvm/llvm-project/pull/118081.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp (+6-1)
  • (modified) llvm/test/CodeGen/AMDGPU/multilevel-break.ll (+15-11)
  • (modified) llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll (+76-74)
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 67012669a6df0c..ecb7f37f5c3a66 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -224,8 +224,13 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
   if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
     Instruction *Insert;
-    if (L->contains(Inst)) {
+    if (LI->getLoopFor(Parent) == L) {
+      // Insert IfBreak in the same BB as Cond, which can help
+      // SILowerControlFlow to know that it does not have to insert an
+      // AND with EXEC.
       Insert = Parent->getTerminator();
+    } else if (L->contains(Inst)) {
+      Insert = Term;
     } else {
       Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
     }
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 65c44768d3d88b..6c62f3f225cd9b 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -27,10 +27,10 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
 ; OPT-NEXT:    [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]])
 ; OPT-NEXT:    [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]])
-; OPT-NEXT:    [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]])
 ; OPT-NEXT:    br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]]
 ; OPT:       Flow1:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
+; OPT-NEXT:    [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]])
 ; OPT-NEXT:    [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
 ; OPT-NEXT:    br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]]
 ; OPT:       IF:
@@ -57,33 +57,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; GCN-NEXT:  .LBB0_2: ; %LOOP.outer
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GCN-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_branch .LBB0_4
 ; GCN-NEXT:  .LBB0_3: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
-; GCN-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT:    s_and_b64 s[10:11], exec, s[8:9]
+; GCN-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_and_b64 s[10:11], s[6:7], exec
+; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[10:11]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
 ; GCN-NEXT:  .LBB0_4: ; %LOOP
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
-; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_3
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GCN-NEXT:    s_branch .LBB0_3
 ; GCN-NEXT:  .LBB0_6: ; %IF
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index a52d9ff526c2ae..bd6ef9e088b12f 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -46,51 +46,52 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap
 ; GCN-NEXT:    s_cbranch_vccz .LBB0_6
 ; GCN-NEXT:  .LBB0_7: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm
-; IR-LABEL: @reduced_nested_loop_conditions(
-; IR-NEXT:  bb:
+; IR-LABEL: define amdgpu_kernel void @reduced_nested_loop_conditions(
+; IR-SAME: ptr addrspace(3) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; IR-NEXT:  [[BB:.*]]:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
-; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]]
+; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG]], i32 [[MY_TMP]]
 ; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 8
-; IR-NEXT:    br label [[BB5:%.*]]
-; IR:       bb3:
-; IR-NEXT:    br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
-; IR:       bb4:
-; IR-NEXT:    br label [[FLOW:%.*]]
-; IR:       bb5:
-; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[BB10:%.*]] ], [ 0, [[BB:%.*]] ]
-; IR-NEXT:    [[MY_TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP5:%.*]], [[BB10]] ]
+; IR-NEXT:    br label %[[BB5:.*]]
+; IR:       [[BB3:.*]]:
+; IR-NEXT:    br i1 true, label %[[BB4:.*]], label %[[BB13:.*]]
+; IR:       [[BB4]]:
+; IR-NEXT:    br label %[[FLOW:.*]]
+; IR:       [[BB5]]:
+; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], %[[BB10:.*]] ], [ 0, %[[BB]] ]
+; IR-NEXT:    [[MY_TMP6:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB10]] ]
 ; IR-NEXT:    [[MY_TMP7:%.*]] = icmp eq i32 [[MY_TMP6]], 1
 ; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]])
 ; IR-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
-; IR-NEXT:    br i1 [[TMP1]], label [[BB8:%.*]], label [[FLOW]]
-; IR:       bb8:
-; IR-NEXT:    br label [[BB13]]
-; IR:       bb9:
-; IR-NEXT:    br i1 false, label [[BB3:%.*]], label [[BB9:%.*]]
-; IR:       bb10:
+; IR-NEXT:    br i1 [[TMP1]], label %[[BB8:.*]], label %[[FLOW]]
+; IR:       [[BB8]]:
+; IR-NEXT:    br label %[[BB13]]
+; IR:       [[BB9:.*]]:
+; IR-NEXT:    br i1 false, label %[[BB3]], label %[[BB9]]
+; IR:       [[BB10]]:
 ; IR-NEXT:    [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]])
-; IR-NEXT:    br i1 [[TMP3]], label [[BB23:%.*]], label [[BB5]]
-; IR:       Flow:
-; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[MY_TMP22:%.*]], [[BB4]] ], [ true, [[BB5]] ]
-; IR-NEXT:    [[TMP5]] = phi i32 [ [[MY_TMP21:%.*]], [[BB4]] ], [ undef, [[BB5]] ]
+; IR-NEXT:    br i1 [[TMP3]], label %[[BB23:.*]], label %[[BB5]]
+; IR:       [[FLOW]]:
+; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[MY_TMP22:%.*]], %[[BB4]] ], [ true, %[[BB5]] ]
+; IR-NEXT:    [[TMP5]] = phi i32 [ [[MY_TMP21:%.*]], %[[BB4]] ], [ undef, %[[BB5]] ]
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
 ; IR-NEXT:    [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]])
-; IR-NEXT:    br label [[BB10]]
-; IR:       bb13:
-; IR-NEXT:    [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], [[BB3]] ], [ true, [[BB8]] ]
+; IR-NEXT:    br label %[[BB10]]
+; IR:       [[BB13]]:
+; IR-NEXT:    [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], %[[BB3]] ], [ true, %[[BB8]] ]
 ; IR-NEXT:    [[MY_TMP15:%.*]] = bitcast i64 [[MY_TMP2]] to <2 x i32>
-; IR-NEXT:    br i1 [[MY_TMP14]], label [[BB16:%.*]], label [[BB20:%.*]]
-; IR:       bb16:
+; IR-NEXT:    br i1 [[MY_TMP14]], label %[[BB16:.*]], label %[[BB20:.*]]
+; IR:       [[BB16]]:
 ; IR-NEXT:    [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1
 ; IR-NEXT:    [[MY_TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(3) undef, i32 [[MY_TMP17]]
 ; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[MY_TMP18]], align 4
-; IR-NEXT:    br label [[BB20]]
-; IR:       bb20:
-; IR-NEXT:    [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ]
-; IR-NEXT:    [[MY_TMP22]] = phi i1 [ false, [[BB16]] ], [ [[MY_TMP14]], [[BB13]] ]
-; IR-NEXT:    br label [[BB9]]
-; IR:       bb23:
+; IR-NEXT:    br label %[[BB20]]
+; IR:       [[BB20]]:
+; IR-NEXT:    [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], %[[BB16]] ], [ 0, %[[BB13]] ]
+; IR-NEXT:    [[MY_TMP22]] = phi i1 [ false, %[[BB16]] ], [ [[MY_TMP14]], %[[BB13]] ]
+; IR-NEXT:    br label %[[BB9]]
+; IR:       [[BB23]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
 ; IR-NEXT:    ret void
 bb:
@@ -188,66 +189,67 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
-; IR-LABEL: @nested_loop_conditions(
-; IR-NEXT:  bb:
+; IR-LABEL: define amdgpu_kernel void @nested_loop_conditions(
+; IR-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; IR-NEXT:  [[BB:.*]]:
 ; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
-; IR-NEXT:    br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
-; IR:       bb14.lr.ph:
+; IR-NEXT:    br i1 [[MY_TMP1235]], label %[[BB14_LR_PH:.*]], label %[[FLOW:.*]]
+; IR:       [[BB14_LR_PH]]:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]]
 ; IR-NEXT:    [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
-; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG:%.*]], i64 [[MY_TMP1]]
+; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG]], i64 [[MY_TMP1]]
 ; IR-NEXT:    [[MY_TMP3:%.*]] = load i64, ptr addrspace(1) [[MY_TMP2]], align 16
 ; IR-NEXT:    [[MY_TMP932:%.*]] = load <4 x i32>, ptr addrspace(1) undef, align 16
 ; IR-NEXT:    [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
-; IR-NEXT:    br label [[BB14:%.*]]
-; IR:       Flow3:
+; IR-NEXT:    br label %[[BB14:.*]]
+; IR:       [[FLOW3:.*]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]])
 ; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]])
 ; IR-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
-; IR-NEXT:    br i1 [[TMP1]], label [[BB4_BB13_CRIT_EDGE:%.*]], label [[FLOW4:%.*]]
-; IR:       bb4.bb13_crit_edge:
-; IR-NEXT:    br label [[FLOW4]]
-; IR:       Flow4:
-; IR-NEXT:    [[TMP3:%.*]] = phi i1 [ true, [[BB4_BB13_CRIT_EDGE]] ], [ false, [[FLOW3:%.*]] ]
+; IR-NEXT:    br i1 [[TMP1]], label %[[BB4_BB13_CRIT_EDGE:.*]], label %[[FLOW4:.*]]
+; IR:       [[BB4_BB13_CRIT_EDGE]]:
+; IR-NEXT:    br label %[[FLOW4]]
+; IR:       [[FLOW4]]:
+; IR-NEXT:    [[TMP3:%.*]] = phi i1 [ true, %[[BB4_BB13_CRIT_EDGE]] ], [ false, %[[FLOW3]] ]
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
-; IR-NEXT:    br label [[FLOW]]
-; IR:       bb13:
-; IR-NEXT:    br label [[BB31:%.*]]
-; IR:       Flow:
-; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP3]], [[FLOW4]] ], [ true, [[BB:%.*]] ]
+; IR-NEXT:    br label %[[FLOW]]
+; IR:       [[BB13:.*]]:
+; IR-NEXT:    br label %[[BB31:.*]]
+; IR:       [[FLOW]]:
+; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP3]], %[[FLOW4]] ], [ true, %[[BB]] ]
 ; IR-NEXT:    [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]])
 ; IR-NEXT:    [[TMP6:%.*]] = extractvalue { i1, i64 } [[TMP5]], 0
 ; IR-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP5]], 1
-; IR-NEXT:    br i1 [[TMP6]], label [[BB13:%.*]], label [[BB31]]
-; IR:       bb14:
-; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP16:%.*]], [[FLOW1:%.*]] ], [ 0, [[BB14_LR_PH]] ]
-; IR-NEXT:    [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], [[BB14_LR_PH]] ], [ [[TMP12:%.*]], [[FLOW1]] ]
-; IR-NEXT:    [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], [[BB14_LR_PH]] ], [ [[TMP11:%.*]], [[FLOW1]] ]
+; IR-NEXT:    br i1 [[TMP6]], label %[[BB13]], label %[[BB31]]
+; IR:       [[BB14]]:
+; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP16:%.*]], %[[FLOW1:.*]] ], [ 0, %[[BB14_LR_PH]] ]
+; IR-NEXT:    [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], %[[BB14_LR_PH]] ], [ [[TMP12:%.*]], %[[FLOW1]] ]
+; IR-NEXT:    [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], %[[BB14_LR_PH]] ], [ [[TMP11:%.*]], %[[FLOW1]] ]
 ; IR-NEXT:    [[MY_TMP15:%.*]] = icmp eq i32 [[MY_TMP1037]], 1
 ; IR-NEXT:    [[TMP8:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP15]])
 ; IR-NEXT:    [[TMP9:%.*]] = extractvalue { i1, i64 } [[TMP8]], 0
 ; IR-NEXT:    [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP8]], 1
-; IR-NEXT:    br i1 [[TMP9]], label [[BB16:%.*]], label [[FLOW1]]
-; IR:       bb16:
+; IR-NEXT:    br i1 [[TMP9]], label %[[BB16:.*]], label %[[FLOW1]]
+; IR:       [[BB16]]:
 ; IR-NEXT:    [[MY_TMP17:%.*]] = bitcast i64 [[MY_TMP3]] to <2 x i32>
-; IR-NEXT:    br label [[BB18:%.*]]
-; IR:       Flow1:
-; IR-NEXT:    [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], [[BB21:%.*]] ], [ undef, [[BB14]] ]
-; IR-NEXT:    [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], [[BB21]] ], [ undef, [[BB14]] ]
-; IR-NEXT:    [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], [[BB21]] ], [ true, [[BB14]] ]
-; IR-NEXT:    [[TMP14]] = phi i1 [ [[MY_TMP12]], [[BB21]] ], [ false, [[BB14]] ]
-; IR-NEXT:    [[TMP15:%.*]] = phi i1 [ false, [[BB21]] ], [ true, [[BB14]] ]
+; IR-NEXT:    br label %[[BB18:.*]]
+; IR:       [[FLOW1]]:
+; IR-NEXT:    [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], %[[BB21:.*]] ], [ undef, %[[BB14]] ]
+; IR-NEXT:    [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], %[[BB21]] ], [ undef, %[[BB14]] ]
+; IR-NEXT:    [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], %[[BB21]] ], [ true, %[[BB14]] ]
+; IR-NEXT:    [[TMP14]] = phi i1 [ [[MY_TMP12]], %[[BB21]] ], [ false, %[[BB14]] ]
+; IR-NEXT:    [[TMP15:%.*]] = phi i1 [ false, %[[BB21]] ], [ true, %[[BB14]] ]
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
 ; IR-NEXT:    [[TMP16]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP13]], i64 [[PHI_BROKEN]])
 ; IR-NEXT:    [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]])
-; IR-NEXT:    br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]]
-; IR:       bb18:
+; IR-NEXT:    br i1 [[TMP17]], label %[[FLOW2:.*]], label %[[BB14]]
+; IR:       [[BB18]]:
 ; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9
-; IR-NEXT:    br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]]
-; IR:       bb21:
+; IR-NEXT:    br i1 [[MY_TMP20]], label %[[BB21]], label %[[BB18]]
+; IR:       [[BB21]]:
 ; IR-NEXT:    [[MY_TMP22:%.*]] = extractelement <2 x i32> [[MY_TMP17]], i64 1
 ; IR-NEXT:    [[MY_TMP23:%.*]] = lshr i32 [[MY_TMP22]], 16
 ; IR-NEXT:    [[MY_TMP24:%.*]] = select i1 undef, i32 undef, i32 [[MY_TMP23]]
@@ -263,16 +265,16 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
 ; IR-NEXT:    [[MY_TMP11:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
-; IR-NEXT:    br label [[FLOW1]]
-; IR:       Flow2:
+; IR-NEXT:    br label %[[FLOW1]]
+; IR:       [[FLOW2]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
 ; IR-NEXT:    [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
 ; IR-NEXT:    [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0
 ; IR-NEXT:    [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1
-; IR-NEXT:    br i1 [[TMP19]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
-; IR:       bb31.loopexit:
-; IR-NEXT:    br label [[FLOW3]]
-; IR:       bb31:
+; IR-NEXT:    br i1 [[TMP19]], label %[[BB31_LOOPEXIT:.*]], label %[[FLOW3]]
+; IR:       [[BB31_LOOPEXIT]]:
+; IR-NEXT:    br label %[[FLOW3]]
+; IR:       [[BB31]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
 ; IR-NEXT:    store volatile i32 0, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    ret void

Contributor

@jayfoad jayfoad left a comment


LGTM. It fixes real problems and seems pretty conservative.

I'm not sure if there is a fundamental reason that putting amdgcn.if.break in an inner loop cannot work, but it certainly does not work as-is. SILowerI1Copies would need to do something with the mask value returned by this intrinsic. (Maybe that would happen automatically if the intrinsic returned a divergent i1 value instead of a uniform i64?)

@mariusz-sikora-at-amd mariusz-sikora-at-amd merged commit 455b4fd into llvm:main Dec 4, 2024
10 checks passed
@mariusz-sikora-at-amd mariusz-sikora-at-amd deleted the pub-upstream-if-break branch December 4, 2024 07:42
@llvm-ci
Collaborator

llvm-ci commented Dec 4, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/11522

Here is the relevant piece of the build log for reference:
Step 7 (Add check check-offload) failure: test (failure)
...
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/test_libc.cpp (951 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug53727.cpp (952 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug47654.cpp (953 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/wtime.c (954 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp (955 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp (956 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/complex_reduction.cpp (957 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp (958 of 960)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp (959 of 960)
TIMEOUT: libomptarget :: amdgcn-amd-amdhsa :: offloading/dynamic-schedule-non-spmd.cpp (960 of 960)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/dynamic-schedule-non-spmd.cpp' FAILED ********************
Exit Code: -9
Timeout: Reached timeout of 100 seconds

Command Output (stdout):
--
# RUN: at line 2
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp    -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/dynamic-schedule-non-spmd.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/dynamic-schedule-non-spmd.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/dynamic-schedule-non-spmd.cpp.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/dynamic-schedule-non-spmd.cpp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/dynamic-schedule-non-spmd.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/dynamic-schedule-non-spmd.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/dynamic-schedule-non-spmd.cpp.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/dynamic-schedule-non-spmd.cpp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True

--

********************
Slowest Tests:
--------------------------------------------------------------------------
100.05s: libomptarget :: amdgcn-amd-amdhsa :: offloading/dynamic-schedule-non-spmd.cpp
16.69s: libomptarget :: amdgcn-amd-amdhsa :: offloading/bug49021.cpp
12.68s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_max.cpp
12.64s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_min.cpp
11.72s: libomptarget :: amdgcn-amd-amdhsa :: offloading/complex_reduction.cpp
10.46s: libomptarget :: amdgcn-amd-amdhsa :: jit/empty_kernel_lvl2.c
9.23s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp
7.69s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ompx_saxpy_mixed.c
7.54s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp
7.38s: libomptarget :: amdgcn-amd-amdhsa :: offloading/barrier_fence.c
7.20s: libomptarget :: amdgcn-amd-amdhsa :: mapping/map_back_race.cpp
6.57s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp
6.50s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/complex_reduction.cpp
6.16s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction.cpp
5.30s: libomptarget :: amdgcn-amd-amdhsa :: offloading/default_thread_limit.c

@mariusz-sikora-at-amd
Contributor Author

I re-triggered the openmp-offload-amdgpu-runtime builder and it is now passing: https://lab.llvm.org/buildbot/#/builders/30/builds/11523
