Skip to content

Reapply "StructurizeCFG: Optimize phi insertion during ssa reconstruction (#101301)" #114347

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 1, 2024

Conversation

ruiling
Copy link
Contributor

@ruiling ruiling commented Oct 31, 2024

This reverts commit be40c72.

@llvmbot
Copy link
Member

llvmbot commented Oct 31, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-transforms

Author: Ruiling, Song (ruiling)

Changes

This reverts commit be40c72.


Patch is 26.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114347.diff

6 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/StructurizeCFG.cpp (+115-18)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+2-4)
  • (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+9-17)
  • (modified) llvm/test/CodeGen/AMDGPU/while-break.ll (+5-8)
  • (modified) llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll (+11-12)
  • (modified) llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll (+26-37)
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 92e47cbc7ae8bf..90c18c0b9c01ce 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Transforms/Scalar/StructurizeCFG.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/STLExtras.h"
@@ -325,6 +326,10 @@ class StructurizeCFG {
   void findUndefBlocks(BasicBlock *PHIBlock,
                        const SmallSet<BasicBlock *, 8> &Incomings,
                        SmallVector<BasicBlock *> &UndefBlks) const;
+
+  void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A,
+                         PHINode *B);
+
   void setPhiValues();
 
   void simplifyAffectedPhis();
@@ -755,10 +760,102 @@ void StructurizeCFG::findUndefBlocks(
   }
 }
 
+// If two phi nodes have compatible incoming values (for each
+// incoming block, either they have the same incoming value or only one phi
+// node has an incoming value), let them share the merged incoming values. The
+// merge process is guided by the equivalence information from \p PhiClasses.
+// The function will possibly update the incoming values of leader phi in
+// DeletedPhis.
+void StructurizeCFG::mergeIfCompatible(
+    EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, PHINode *B) {
+  auto ItA = PhiClasses.findLeader(PhiClasses.insert(A));
+  auto ItB = PhiClasses.findLeader(PhiClasses.insert(B));
+  // They are already in the same class, no work needed.
+  if (ItA == ItB)
+    return;
+
+  PHINode *LeaderA = *ItA;
+  PHINode *LeaderB = *ItB;
+  BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA];
+  BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB];
+
+  DenseMap<BasicBlock *, Value *> Mergeable(IncomingA.begin(), IncomingA.end());
+  for (auto [BB, V] : IncomingB) {
+    auto BBIt = Mergeable.find(BB);
+    if (BBIt != Mergeable.end() && BBIt->second != V)
+      return;
+    // Either IncomingA does not have this value or IncomingA has the same
+    // value.
+    Mergeable.insert({BB, V});
+  }
+
+  // Update the incoming value of leaderA.
+  IncomingA.assign(Mergeable.begin(), Mergeable.end());
+  PhiClasses.unionSets(ItA, ItB);
+}
+
 /// Add the real PHI value as soon as everything is set up
 void StructurizeCFG::setPhiValues() {
   SmallVector<PHINode *, 8> InsertedPhis;
   SSAUpdater Updater(&InsertedPhis);
+  DenseMap<BasicBlock *, SmallVector<BasicBlock *>> UndefBlksMap;
+
+  // Find phi nodes that have compatible incoming values (either they have
+  // the same value for the same block or only one phi node has an incoming
+  // value, see example below). We only search again the phi's that are
+  // referenced by another phi, which is the case we care about.
+  //
+  // For example (-- means no incoming value):
+  // phi1 : BB1:phi2   BB2:v  BB3:--
+  // phi2:  BB1:--     BB2:v  BB3:w
+  //
+  // Then we can merge these incoming values and let phi1, phi2 use the
+  // same set of incoming values:
+  //
+  // phi1&phi2: BB1:phi2  BB2:v  BB3:w
+  //
+  // By doing this, phi1 and phi2 would share more intermediate phi nodes.
+  // This would help reduce the number of phi nodes during SSA reconstruction
+  // and ultimately result in fewer COPY instructions.
+  //
+  // This should be correct, because if a phi node does not have incoming
+  // value from certain block, this means the block is not the predecessor
+  // of the parent block, so we actually don't care about its incoming value.
+  EquivalenceClasses<PHINode *> PhiClasses;
+  for (const auto &[To, From] : AddedPhis) {
+    auto OldPhiIt = DeletedPhis.find(To);
+    if (OldPhiIt == DeletedPhis.end())
+      continue;
+
+    PhiMap &BlkPhis = OldPhiIt->second;
+    SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
+    SmallSet<BasicBlock *, 8> Incomings;
+
+    // Get the undefined blocks shared by all the phi nodes.
+    if (!BlkPhis.empty()) {
+      for (const auto &VI : BlkPhis.front().second)
+        Incomings.insert(VI.first);
+      findUndefBlocks(To, Incomings, UndefBlks);
+    }
+
+    for (const auto &[Phi, Incomings] : OldPhiIt->second) {
+      SmallVector<PHINode *> IncomingPHIs;
+      for (const auto &[BB, V] : Incomings) {
+        // First, for each phi, check whether it has incoming value which is
+        // another phi.
+        if (PHINode *P = dyn_cast<PHINode>(V))
+          IncomingPHIs.push_back(P);
+      }
+
+      for (auto *OtherPhi : IncomingPHIs) {
+        // Skip phis that are unrelated to the phi reconstruction for now.
+        if (!DeletedPhis.contains(OtherPhi->getParent()))
+          continue;
+        mergeIfCompatible(PhiClasses, Phi, OtherPhi);
+      }
+    }
+  }
+
   for (const auto &AddedPhi : AddedPhis) {
     BasicBlock *To = AddedPhi.first;
     const BBVector &From = AddedPhi.second;
@@ -766,28 +863,27 @@ void StructurizeCFG::setPhiValues() {
     if (!DeletedPhis.count(To))
       continue;
 
-    SmallVector<BasicBlock *> UndefBlks;
-    bool CachedUndefs = false;
     PhiMap &Map = DeletedPhis[To];
-    for (const auto &PI : Map) {
-      PHINode *Phi = PI.first;
+    SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
+    for (const auto &[Phi, Incoming] : Map) {
       Value *Undef = UndefValue::get(Phi->getType());
       Updater.Initialize(Phi->getType(), "");
       Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
       Updater.AddAvailableValue(To, Undef);
 
-      SmallSet<BasicBlock *, 8> Incomings;
-      SmallVector<BasicBlock *> ConstantPreds;
-      for (const auto &VI : PI.second) {
-        Incomings.insert(VI.first);
-        Updater.AddAvailableValue(VI.first, VI.second);
-        if (isa<Constant>(VI.second))
-          ConstantPreds.push_back(VI.first);
-      }
+      // Use leader phi's incoming if there is.
+      auto LeaderIt = PhiClasses.findLeader(Phi);
+      bool UseIncomingOfLeader =
+          LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi;
+      const auto &IncomingMap =
+          UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt]
+                              : Incoming;
 
-      if (!CachedUndefs) {
-        findUndefBlocks(To, Incomings, UndefBlks);
-        CachedUndefs = true;
+      SmallVector<BasicBlock *> ConstantPreds;
+      for (const auto &[BB, V] : IncomingMap) {
+        Updater.AddAvailableValue(BB, V);
+        if (isa<Constant>(V))
+          ConstantPreds.push_back(BB);
       }
 
       for (auto UB : UndefBlks) {
@@ -798,6 +894,10 @@ void StructurizeCFG::setPhiValues() {
         if (any_of(ConstantPreds,
                    [&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
           continue;
+        // Maybe already get a value through sharing with other phi nodes.
+        if (Updater.HasValueForBlock(UB))
+          continue;
+
         Updater.AddAvailableValue(UB, Undef);
       }
 
@@ -805,10 +905,7 @@ void StructurizeCFG::setPhiValues() {
         Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
       AffectedPhis.push_back(Phi);
     }
-
-    DeletedPhis.erase(To);
   }
-  assert(DeletedPhis.empty());
 
   AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index b27d8fdc24ff73..935200d5953072 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX10-NEXT:    s_branch .LBB4_2
 ; GFX10-NEXT:  .LBB4_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
@@ -312,7 +312,6 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
 ; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_4
@@ -329,12 +328,11 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
 ; GFX10-NEXT:    s_mov_b32 s7, -1
-; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
 ; GFX10-NEXT:  ; %bb.5: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
 ; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
 ; GFX10-NEXT:    s_or_b32 s7, s4, s7
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 49b450a9af0bc9..4d26453e1a0d6d 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -576,11 +576,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s8
+; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s9
-; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v7, s9
+; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
@@ -641,10 +641,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v12, v20, v12
 ; GFX908-NEXT:    v_add_f32_e32 v5, v5, v25
 ; GFX908-NEXT:    v_add_f32_e32 v4, v4, v24
-; GFX908-NEXT:    v_add_f32_e32 v9, v9, v27
-; GFX908-NEXT:    v_add_f32_e32 v8, v8, v26
-; GFX908-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX908-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX908-NEXT:    v_add_f32_e32 v7, v7, v27
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v26
+; GFX908-NEXT:    v_add_f32_e32 v8, v8, v14
+; GFX908-NEXT:    v_add_f32_e32 v9, v9, v15
 ; GFX908-NEXT:    v_add_f32_e32 v10, v10, v12
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_mov_b64 s[20:21], -1
@@ -654,10 +654,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX908-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
@@ -743,8 +739,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -800,8 +796,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
 ; GFX90A-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[24:25]
-; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[16:17]
+; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
 ; GFX90A-NEXT:    s_branch .LBB3_4
@@ -810,10 +806,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX90A-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 46254994580d2d..9bb8a2f9f0282c 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:    s_branch .LBB2_2
 ; GCN-NEXT:  .LBB2_1: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GCN-NEXT:    s_and_b32 s1, exec_lo, s4
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT:    s_and_b32 s1, exec_lo, s1
 ; GCN-NEXT:    s_or_b32 s2, s1, s2
 ; GCN-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
 ; GCN-NEXT:    s_cbranch_execz .LBB2_6
@@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:  .LBB2_4: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT:    v_mov_b32_e32 v7, v6
-; GCN-NEXT:    s_mov_b32 s4, -1
-; GCN-NEXT:    s_and_saveexec_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s1, -1
+; GCN-NEXT:    s_and_saveexec_b32 s4, s3
 ; GCN-NEXT:    s_cbranch_execz .LBB2_1
 ; GCN-NEXT:  ; %bb.5: ; %latch
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v3
-; GCN-NEXT:    v_mov_b32_e32 v7, v0
 ; GCN-NEXT:    s_add_i32 s0, s0, 1
-; GCN-NEXT:    s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT:    s_orn2_b32 s1, vcc_lo, exec_lo
 ; GCN-NEXT:    s_branch .LBB2_1
 ; GCN-NEXT:  .LBB2_6: ; %end
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, v7
 ; GCN-NEXT:    v_mov_b32_e32 v1, v6
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
index 10a3e65e5f57d6..385e37e2750d1e 100644
--- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
+++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
@@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       LOOP.HEADER:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ]
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]]
@@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
 ; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
 ; CHECK:       Flow2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
 ; CHECK:       INNER_LOOP:
 ; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
@@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271
 ; CHECK-NEXT:    br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
 ; CHECK:       Flow3:
-; CHECK-NEXT:    [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
 ; CHECK:       Flow4:
-; CHECK-NEXT:    br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:    br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
 ; CHECK:       bb64:
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[BB18]], label [[FLOW2]]
+; CHECK-NEXT:    [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB18]], label [[FLOW2]]
 ; CHECK:       INCREMENT_I:
 ; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336
diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
index c832b7d1394a88..46881ec8272861 100644
--- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
+++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
@@ -7,8 +7,8 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[HEADER:.*]]
 ; CHECK:       [[HEADER]]:
-; CHECK-NEXT:    [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ]
-; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ]
+; CHECK-NEXT:    [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ]
+; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ]
 ; CHECK-NEXT:    [[CC:%.*]] = icmp sge i32 [[IND]], [[X]]
 ; CHECK-NEXT:    br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]]
 ; CHECK:       [[FLOW]]:
@@ -23,20 +23,17 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 {
 ; CHECK-NEXT:    [[CC2]] = icmp slt i32 [[IND]], [[Y]]
 ; CHECK-NEXT:    br label %[[FLOW]]
 ; CHECK:       [[FLOW1]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ undef, %[[IF]] ], [ [[TMP0]]...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Oct 31, 2024

@llvm/pr-subscribers-llvm-globalisel

Author: Ruiling, Song (ruiling)

Changes

This reverts commit be40c72.


Patch is 26.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114347.diff

6 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/StructurizeCFG.cpp (+115-18)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+2-4)
  • (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+9-17)
  • (modified) llvm/test/CodeGen/AMDGPU/while-break.ll (+5-8)
  • (modified) llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll (+11-12)
  • (modified) llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll (+26-37)
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 92e47cbc7ae8bf..90c18c0b9c01ce 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Transforms/Scalar/StructurizeCFG.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/STLExtras.h"
@@ -325,6 +326,10 @@ class StructurizeCFG {
   void findUndefBlocks(BasicBlock *PHIBlock,
                        const SmallSet<BasicBlock *, 8> &Incomings,
                        SmallVector<BasicBlock *> &UndefBlks) const;
+
+  void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A,
+                         PHINode *B);
+
   void setPhiValues();
 
   void simplifyAffectedPhis();
@@ -755,10 +760,102 @@ void StructurizeCFG::findUndefBlocks(
   }
 }
 
+// If two phi nodes have compatible incoming values (for each
+// incoming block, either they have the same incoming value or only one phi
+// node has an incoming value), let them share the merged incoming values. The
+// merge process is guided by the equivalence information from \p PhiClasses.
+// The function will possibly update the incoming values of leader phi in
+// DeletedPhis.
+void StructurizeCFG::mergeIfCompatible(
+    EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, PHINode *B) {
+  auto ItA = PhiClasses.findLeader(PhiClasses.insert(A));
+  auto ItB = PhiClasses.findLeader(PhiClasses.insert(B));
+  // They are already in the same class, no work needed.
+  if (ItA == ItB)
+    return;
+
+  PHINode *LeaderA = *ItA;
+  PHINode *LeaderB = *ItB;
+  BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA];
+  BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB];
+
+  DenseMap<BasicBlock *, Value *> Mergeable(IncomingA.begin(), IncomingA.end());
+  for (auto [BB, V] : IncomingB) {
+    auto BBIt = Mergeable.find(BB);
+    if (BBIt != Mergeable.end() && BBIt->second != V)
+      return;
+    // Either IncomingA does not have this value or IncomingA has the same
+    // value.
+    Mergeable.insert({BB, V});
+  }
+
+  // Update the incoming value of leaderA.
+  IncomingA.assign(Mergeable.begin(), Mergeable.end());
+  PhiClasses.unionSets(ItA, ItB);
+}
+
 /// Add the real PHI value as soon as everything is set up
 void StructurizeCFG::setPhiValues() {
   SmallVector<PHINode *, 8> InsertedPhis;
   SSAUpdater Updater(&InsertedPhis);
+  DenseMap<BasicBlock *, SmallVector<BasicBlock *>> UndefBlksMap;
+
+  // Find phi nodes that have compatible incoming values (either they have
+  // the same value for the same block or only one phi node has an incoming
+  // value, see example below). We only search again the phi's that are
+  // referenced by another phi, which is the case we care about.
+  //
+  // For example (-- means no incoming value):
+  // phi1 : BB1:phi2   BB2:v  BB3:--
+  // phi2:  BB1:--     BB2:v  BB3:w
+  //
+  // Then we can merge these incoming values and let phi1, phi2 use the
+  // same set of incoming values:
+  //
+  // phi1&phi2: BB1:phi2  BB2:v  BB3:w
+  //
+  // By doing this, phi1 and phi2 would share more intermediate phi nodes.
+  // This would help reduce the number of phi nodes during SSA reconstruction
+  // and ultimately result in fewer COPY instructions.
+  //
+  // This should be correct, because if a phi node does not have incoming
+  // value from certain block, this means the block is not the predecessor
+  // of the parent block, so we actually don't care about its incoming value.
+  EquivalenceClasses<PHINode *> PhiClasses;
+  for (const auto &[To, From] : AddedPhis) {
+    auto OldPhiIt = DeletedPhis.find(To);
+    if (OldPhiIt == DeletedPhis.end())
+      continue;
+
+    PhiMap &BlkPhis = OldPhiIt->second;
+    SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
+    SmallSet<BasicBlock *, 8> Incomings;
+
+    // Get the undefined blocks shared by all the phi nodes.
+    if (!BlkPhis.empty()) {
+      for (const auto &VI : BlkPhis.front().second)
+        Incomings.insert(VI.first);
+      findUndefBlocks(To, Incomings, UndefBlks);
+    }
+
+    for (const auto &[Phi, Incomings] : OldPhiIt->second) {
+      SmallVector<PHINode *> IncomingPHIs;
+      for (const auto &[BB, V] : Incomings) {
+        // First, for each phi, check whether it has incoming value which is
+        // another phi.
+        if (PHINode *P = dyn_cast<PHINode>(V))
+          IncomingPHIs.push_back(P);
+      }
+
+      for (auto *OtherPhi : IncomingPHIs) {
+        // Skip phis that are unrelated to the phi reconstruction for now.
+        if (!DeletedPhis.contains(OtherPhi->getParent()))
+          continue;
+        mergeIfCompatible(PhiClasses, Phi, OtherPhi);
+      }
+    }
+  }
+
   for (const auto &AddedPhi : AddedPhis) {
     BasicBlock *To = AddedPhi.first;
     const BBVector &From = AddedPhi.second;
@@ -766,28 +863,27 @@ void StructurizeCFG::setPhiValues() {
     if (!DeletedPhis.count(To))
       continue;
 
-    SmallVector<BasicBlock *> UndefBlks;
-    bool CachedUndefs = false;
     PhiMap &Map = DeletedPhis[To];
-    for (const auto &PI : Map) {
-      PHINode *Phi = PI.first;
+    SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
+    for (const auto &[Phi, Incoming] : Map) {
       Value *Undef = UndefValue::get(Phi->getType());
       Updater.Initialize(Phi->getType(), "");
       Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
       Updater.AddAvailableValue(To, Undef);
 
-      SmallSet<BasicBlock *, 8> Incomings;
-      SmallVector<BasicBlock *> ConstantPreds;
-      for (const auto &VI : PI.second) {
-        Incomings.insert(VI.first);
-        Updater.AddAvailableValue(VI.first, VI.second);
-        if (isa<Constant>(VI.second))
-          ConstantPreds.push_back(VI.first);
-      }
+      // Use leader phi's incoming if there is.
+      auto LeaderIt = PhiClasses.findLeader(Phi);
+      bool UseIncomingOfLeader =
+          LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi;
+      const auto &IncomingMap =
+          UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt]
+                              : Incoming;
 
-      if (!CachedUndefs) {
-        findUndefBlocks(To, Incomings, UndefBlks);
-        CachedUndefs = true;
+      SmallVector<BasicBlock *> ConstantPreds;
+      for (const auto &[BB, V] : IncomingMap) {
+        Updater.AddAvailableValue(BB, V);
+        if (isa<Constant>(V))
+          ConstantPreds.push_back(BB);
       }
 
       for (auto UB : UndefBlks) {
@@ -798,6 +894,10 @@ void StructurizeCFG::setPhiValues() {
         if (any_of(ConstantPreds,
                    [&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
           continue;
+        // Maybe already get a value through sharing with other phi nodes.
+        if (Updater.HasValueForBlock(UB))
+          continue;
+
         Updater.AddAvailableValue(UB, Undef);
       }
 
@@ -805,10 +905,7 @@ void StructurizeCFG::setPhiValues() {
         Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
       AffectedPhis.push_back(Phi);
     }
-
-    DeletedPhis.erase(To);
   }
-  assert(DeletedPhis.empty());
 
   AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index b27d8fdc24ff73..935200d5953072 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX10-NEXT:    s_branch .LBB4_2
 ; GFX10-NEXT:  .LBB4_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
@@ -312,7 +312,6 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
 ; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_4
@@ -329,12 +328,11 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
 ; GFX10-NEXT:    s_mov_b32 s7, -1
-; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
 ; GFX10-NEXT:  ; %bb.5: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
 ; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
 ; GFX10-NEXT:    s_or_b32 s7, s4, s7
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 49b450a9af0bc9..4d26453e1a0d6d 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -576,11 +576,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s8
+; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s9
-; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v7, s9
+; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
@@ -641,10 +641,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v12, v20, v12
 ; GFX908-NEXT:    v_add_f32_e32 v5, v5, v25
 ; GFX908-NEXT:    v_add_f32_e32 v4, v4, v24
-; GFX908-NEXT:    v_add_f32_e32 v9, v9, v27
-; GFX908-NEXT:    v_add_f32_e32 v8, v8, v26
-; GFX908-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX908-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX908-NEXT:    v_add_f32_e32 v7, v7, v27
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v26
+; GFX908-NEXT:    v_add_f32_e32 v8, v8, v14
+; GFX908-NEXT:    v_add_f32_e32 v9, v9, v15
 ; GFX908-NEXT:    v_add_f32_e32 v10, v10, v12
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_mov_b64 s[20:21], -1
@@ -654,10 +654,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX908-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
@@ -743,8 +739,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -800,8 +796,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
 ; GFX90A-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[24:25]
-; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[16:17]
+; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
 ; GFX90A-NEXT:    s_branch .LBB3_4
@@ -810,10 +806,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX90A-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 46254994580d2d..9bb8a2f9f0282c 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:    s_branch .LBB2_2
 ; GCN-NEXT:  .LBB2_1: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GCN-NEXT:    s_and_b32 s1, exec_lo, s4
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT:    s_and_b32 s1, exec_lo, s1
 ; GCN-NEXT:    s_or_b32 s2, s1, s2
 ; GCN-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
 ; GCN-NEXT:    s_cbranch_execz .LBB2_6
@@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:  .LBB2_4: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT:    v_mov_b32_e32 v7, v6
-; GCN-NEXT:    s_mov_b32 s4, -1
-; GCN-NEXT:    s_and_saveexec_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s1, -1
+; GCN-NEXT:    s_and_saveexec_b32 s4, s3
 ; GCN-NEXT:    s_cbranch_execz .LBB2_1
 ; GCN-NEXT:  ; %bb.5: ; %latch
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v3
-; GCN-NEXT:    v_mov_b32_e32 v7, v0
 ; GCN-NEXT:    s_add_i32 s0, s0, 1
-; GCN-NEXT:    s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT:    s_orn2_b32 s1, vcc_lo, exec_lo
 ; GCN-NEXT:    s_branch .LBB2_1
 ; GCN-NEXT:  .LBB2_6: ; %end
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, v7
 ; GCN-NEXT:    v_mov_b32_e32 v1, v6
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
index 10a3e65e5f57d6..385e37e2750d1e 100644
--- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
+++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
@@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       LOOP.HEADER:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ]
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]]
@@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
 ; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
 ; CHECK:       Flow2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
 ; CHECK:       INNER_LOOP:
 ; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
@@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271
 ; CHECK-NEXT:    br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
 ; CHECK:       Flow3:
-; CHECK-NEXT:    [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
 ; CHECK:       Flow4:
-; CHECK-NEXT:    br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:    br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
 ; CHECK:       bb64:
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[BB18]], label [[FLOW2]]
+; CHECK-NEXT:    [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB18]], label [[FLOW2]]
 ; CHECK:       INCREMENT_I:
 ; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336
diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
index c832b7d1394a88..46881ec8272861 100644
--- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
+++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll
@@ -7,8 +7,8 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[HEADER:.*]]
 ; CHECK:       [[HEADER]]:
-; CHECK-NEXT:    [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ]
-; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ]
+; CHECK-NEXT:    [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ]
+; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ]
 ; CHECK-NEXT:    [[CC:%.*]] = icmp sge i32 [[IND]], [[X]]
 ; CHECK-NEXT:    br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]]
 ; CHECK:       [[FLOW]]:
@@ -23,20 +23,17 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 {
 ; CHECK-NEXT:    [[CC2]] = icmp slt i32 [[IND]], [[Y]]
 ; CHECK-NEXT:    br label %[[FLOW]]
 ; CHECK:       [[FLOW1]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ undef, %[[IF]] ], [ [[TMP0]]...
[truncated]

@ruiling
Copy link
Contributor Author

ruiling commented Oct 31, 2024

@yxsamliu has helped with the debugging work of the original regression. The root cause is not the change itself, but hit certain deep issue in the register allocation of AMDGPU backend. With the recent fix and improvement in the register allocation part, we should be safe to land this again.

@ruiling ruiling merged commit 54d31bd into llvm:main Nov 1, 2024
12 checks passed
smallp-o-p pushed a commit to smallp-o-p/llvm-project that referenced this pull request Nov 3, 2024
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants