|
| 1 | +/*========================== begin_copyright_notice ============================ |
| 2 | + |
| 3 | +Copyright (C) 2024 Intel Corporation |
| 4 | + |
| 5 | +SPDX-License-Identifier: MIT |
| 6 | + |
| 7 | +============================= end_copyright_notice ===========================*/ |
| 8 | + |
| 9 | +/*========================== begin_copyright_notice ============================ |
| 10 | + |
| 11 | +Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 12 | +See https://llvm.org/LICENSE.txt for license information. |
| 13 | +SPDX-License-Identifier: Apache-2.0 with LLVM-exception |
| 14 | + |
| 15 | +============================= end_copyright_notice ===========================*/ |
| 16 | + |
| 17 | +From c522ebaa9d67809e7b3e2660321b12b999da24b3 Mon Sep 17 00:00:00 2001 |
| 18 | +From: pawelflisikowski <[email protected]> |
| 19 | +Date: Tue, 28 Jan 2025 03:19:57 -0800 |
| 20 | +Subject: [PATCH] [IGC LLVM] Make getPreviousDefRecursive iterative |
| 21 | + |
| 22 | +Description: |
| 23 | +Large kernels with long use-def chains may cause recursive calls to |
| 24 | +exceed the stack space within the memory SSA updater of the LICM pass. |
| 25 | + |
| 26 | +This was observed with Blender on LNL (OGLVK). |
| 27 | + |
| 28 | +Replace the MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(...) |
| 29 | +function with the new iterative version. |
| 30 | + |
| 31 | +Platforms: |
| 32 | +All |
| 33 | +--- |
| 34 | + llvm/include/llvm/Analysis/MemorySSAUpdater.h | 2 +- |
| 35 | + llvm/lib/Analysis/MemorySSAUpdater.cpp | 267 ++++++++++++------ |
| 36 | + 2 files changed, 185 insertions(+), 84 deletions(-) |
| 37 | + |
| 38 | +diff --git a/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/llvm/include/llvm/Analysis/MemorySSAUpdater.h |
| 39 | +index 3e5ebe9cb..013994d86 100644 |
| 40 | +--- a/llvm/include/llvm/Analysis/MemorySSAUpdater.h |
| 41 | ++++ b/llvm/include/llvm/Analysis/MemorySSAUpdater.h |
| 42 | +@@ -255,7 +255,7 @@ private: |
| 43 | + getPreviousDefFromEnd(BasicBlock *, |
| 44 | + DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &); |
| 45 | + MemoryAccess * |
| 46 | +- getPreviousDefRecursive(BasicBlock *, |
| 47 | ++ getPreviousDefIterative(BasicBlock *, |
| 48 | + DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &); |
| 49 | + MemoryAccess *recursePhi(MemoryAccess *Phi); |
| 50 | + MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi); |
| 51 | +diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp |
| 52 | +index 9c841883d..800e9812c 100644 |
| 53 | +--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp |
| 54 | ++++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp |
| 55 | +@@ -27,6 +27,7 @@ |
| 56 | + #include "llvm/Support/Debug.h" |
| 57 | + #include "llvm/Support/FormattedStream.h" |
| 58 | + #include <algorithm> |
| 59 | ++#include <stack> |
| 60 | + |
| 61 | + #define DEBUG_TYPE "memoryssa" |
| 62 | + using namespace llvm; |
| 63 | +@@ -40,101 +41,201 @@ using namespace llvm; |
| 64 | + // that there are two or more definitions needing to be merged. |
| 65 | + // This still will leave non-minimal form in the case of irreducible control |
| 66 | + // flow, where phi nodes may be in cycles with themselves, but unnecessary. |
| 67 | +-MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive( |
| 68 | ++MemoryAccess *MemorySSAUpdater::getPreviousDefIterative( |
| 69 | + BasicBlock *BB, |
| 70 | + DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &CachedPreviousDef) { |
| 71 | +- // First, do a cache lookup. Without this cache, certain CFG structures |
| 72 | +- // (like a series of if statements) take exponential time to visit. |
| 73 | +- auto Cached = CachedPreviousDef.find(BB); |
| 74 | +- if (Cached != CachedPreviousDef.end()) |
| 75 | +- return Cached->second; |
| 76 | +- |
| 77 | +- // If this method is called from an unreachable block, return LoE. |
| 78 | +- if (!MSSA->DT->isReachableFromEntry(BB)) |
| 79 | +- return MSSA->getLiveOnEntryDef(); |
| 80 | ++ enum ResumePoint { |
| 81 | ++ START, |
| 82 | ++ COLLECT_VALUE_FROM_UNIQUE_PREDECESSOR, |
| 83 | ++ COLLECT_VALUES_FROM_PREDECESSORS, |
| 84 | ++ PROCESS_PREDECESSOR, |
| 85 | ++ GET_PREVIOUS_DEF_FROM_END, |
| 86 | ++ SIMPLIFY_OPS |
| 87 | ++ }; |
| 88 | + |
| 89 | +- if (BasicBlock *Pred = BB->getUniquePredecessor()) { |
| 90 | +- VisitedBlocks.insert(BB); |
| 91 | +- // Single predecessor case, just recurse, we can only have one definition. |
| 92 | +- MemoryAccess *Result = getPreviousDefFromEnd(Pred, CachedPreviousDef); |
| 93 | +- CachedPreviousDef.insert({BB, Result}); |
| 94 | +- return Result; |
| 95 | +- } |
| 96 | ++ class StackFrame { |
| 97 | ++ public: |
| 98 | ++ BasicBlock *BB; |
| 99 | ++ ResumePoint ResumeAt; |
| 100 | ++ bool UniqueIncomingAccess; |
| 101 | ++ MemoryAccess *SingleAccess; |
| 102 | ++ SmallVector<TrackingVH<MemoryAccess>, 8> PhiOps; |
| 103 | ++ // Iterators for keeping track of predecessor blocks that are already |
| 104 | ++ // processed. |
| 105 | ++ pred_iterator PredIt; |
| 106 | ++ pred_iterator PredEnd; |
| 107 | ++ StackFrame(BasicBlock *BB, ResumePoint resumePoint) |
| 108 | ++ : BB(BB), ResumeAt(resumePoint), UniqueIncomingAccess(true), |
| 109 | ++ SingleAccess(nullptr), PredIt(pred_begin(BB)), PredEnd(pred_end(BB)) { |
| 110 | ++ } |
| 111 | ++ }; |
| 112 | + |
| 113 | +- if (VisitedBlocks.count(BB)) { |
| 114 | +- // We hit our node again, meaning we had a cycle, we must insert a phi |
| 115 | +- // node to break it so we have an operand. The only case this will |
| 116 | +- // insert useless phis is if we have irreducible control flow. |
| 117 | +- MemoryAccess *Result = MSSA->createMemoryPhi(BB); |
| 118 | +- CachedPreviousDef.insert({BB, Result}); |
| 119 | +- return Result; |
| 120 | +- } |
| 121 | ++ std::stack<StackFrame> WorkStack; |
| 122 | ++ std::stack<MemoryAccess *> ReturnStack; |
| 123 | ++ WorkStack.push(StackFrame(BB, START)); |
| 124 | ++ |
| 125 | ++ while (!WorkStack.empty()) { |
| 126 | ++ StackFrame &CurrentFrame = WorkStack.top(); |
| 127 | ++ BasicBlock *CurrentBB = CurrentFrame.BB; |
| 128 | ++ |
| 129 | ++ switch (CurrentFrame.ResumeAt) { |
| 130 | ++ case START: { |
| 131 | ++ // First, do a cache lookup. Without this cache, certain CFG structures |
| 132 | ++ // (like a series of if statements) take exponential time to visit. |
| 133 | ++ auto Cached = CachedPreviousDef.find(CurrentBB); |
| 134 | ++ if (Cached != CachedPreviousDef.end()) { |
| 135 | ++ ReturnStack.push(Cached->second); |
| 136 | ++ WorkStack.pop(); |
| 137 | ++ break; |
| 138 | ++ } |
| 139 | + |
| 140 | +- if (VisitedBlocks.insert(BB).second) { |
| 141 | +- // Mark us visited so we can detect a cycle |
| 142 | +- SmallVector<TrackingVH<MemoryAccess>, 8> PhiOps; |
| 143 | ++ // If this method is called from an unreachable block, return LoE. |
| 144 | ++ if (!MSSA->DT->isReachableFromEntry(CurrentBB)) { |
| 145 | ++ MemoryAccess *LoE = MSSA->getLiveOnEntryDef(); |
| 146 | ++ ReturnStack.push(LoE); |
| 147 | ++ WorkStack.pop(); |
| 148 | ++ break; |
| 149 | ++ } |
| 150 | + |
| 151 | +- // Recurse to get the values in our predecessors for placement of a |
| 152 | +- // potential phi node. This will insert phi nodes if we cycle in order to |
| 153 | +- // break the cycle and have an operand. |
| 154 | +- bool UniqueIncomingAccess = true; |
| 155 | +- MemoryAccess *SingleAccess = nullptr; |
| 156 | +- for (auto *Pred : predecessors(BB)) { |
| 157 | +- if (MSSA->DT->isReachableFromEntry(Pred)) { |
| 158 | +- auto *IncomingAccess = getPreviousDefFromEnd(Pred, CachedPreviousDef); |
| 159 | +- if (!SingleAccess) |
| 160 | +- SingleAccess = IncomingAccess; |
| 161 | +- else if (IncomingAccess != SingleAccess) |
| 162 | +- UniqueIncomingAccess = false; |
| 163 | +- PhiOps.push_back(IncomingAccess); |
| 164 | +- } else |
| 165 | +- PhiOps.push_back(MSSA->getLiveOnEntryDef()); |
| 166 | +- } |
| 167 | ++ if (BasicBlock *Pred = CurrentBB->getUniquePredecessor()) { |
| 168 | ++ VisitedBlocks.insert(CurrentBB); |
| 169 | ++ // Single predecessor case, just recurse, we can only have one |
| 170 | ++ // definition. |
| 171 | ++ CurrentFrame.ResumeAt = COLLECT_VALUE_FROM_UNIQUE_PREDECESSOR; |
| 172 | ++ WorkStack.push(StackFrame(Pred, GET_PREVIOUS_DEF_FROM_END)); |
| 173 | ++ break; |
| 174 | ++ } |
| 175 | + |
| 176 | +- // Now try to simplify the ops to avoid placing a phi. |
| 177 | +- // This may return null if we never created a phi yet, that's okay |
| 178 | +- MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MSSA->getMemoryAccess(BB)); |
| 179 | +- |
| 180 | +- // See if we can avoid the phi by simplifying it. |
| 181 | +- auto *Result = tryRemoveTrivialPhi(Phi, PhiOps); |
| 182 | +- // If we couldn't simplify, we may have to create a phi |
| 183 | +- if (Result == Phi && UniqueIncomingAccess && SingleAccess) { |
| 184 | +- // A concrete Phi only exists if we created an empty one to break a cycle. |
| 185 | +- if (Phi) { |
| 186 | +- assert(Phi->operands().empty() && "Expected empty Phi"); |
| 187 | +- Phi->replaceAllUsesWith(SingleAccess); |
| 188 | +- removeMemoryAccess(Phi); |
| 189 | ++ // If this block has been already visited |
| 190 | ++ if (VisitedBlocks.count(CurrentBB)) { |
| 191 | ++ // We hit our node again, meaning we had a cycle, we must insert a phi |
| 192 | ++ // node to break it so we have an operand. The only case this will |
| 193 | ++ // insert useless phis is if we have irreducible control flow. |
| 194 | ++ MemoryAccess *Result = MSSA->createMemoryPhi(CurrentBB); |
| 195 | ++ CachedPreviousDef.insert({CurrentBB, Result}); |
| 196 | ++ ReturnStack.push(Result); |
| 197 | ++ WorkStack.pop(); |
| 198 | ++ break; |
| 199 | + } |
| 200 | +- Result = SingleAccess; |
| 201 | +- } else if (Result == Phi && !(UniqueIncomingAccess && SingleAccess)) { |
| 202 | +- if (!Phi) |
| 203 | +- Phi = MSSA->createMemoryPhi(BB); |
| 204 | +- |
| 205 | +- // See if the existing phi operands match what we need. |
| 206 | +- // Unlike normal SSA, we only allow one phi node per block, so we can't just |
| 207 | +- // create a new one. |
| 208 | +- if (Phi->getNumOperands() != 0) { |
| 209 | +- // FIXME: Figure out whether this is dead code and if so remove it. |
| 210 | +- if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) { |
| 211 | +- // These will have been filled in by the recursive read we did above. |
| 212 | +- llvm::copy(PhiOps, Phi->op_begin()); |
| 213 | +- std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin()); |
| 214 | ++ |
| 215 | ++ // If block hasn't been visited (true if the element was successfully |
| 216 | ++ // inserted, i.e., it was not already present in the set). |
| 217 | ++ if (VisitedBlocks.insert(CurrentBB).second) { |
| 218 | ++ if (CurrentFrame.PredIt != CurrentFrame.PredEnd) { |
| 219 | ++ CurrentFrame.ResumeAt = COLLECT_VALUES_FROM_PREDECESSORS; |
| 220 | ++ WorkStack.push(StackFrame(*CurrentFrame.PredIt, PROCESS_PREDECESSOR)); |
| 221 | ++ break; |
| 222 | + } |
| 223 | ++ } |
| 224 | ++ } |
| 225 | ++ case GET_PREVIOUS_DEF_FROM_END: { |
| 226 | ++ auto *Defs = MSSA->getWritableBlockDefs(CurrentBB); |
| 227 | ++ |
| 228 | ++ if (Defs) { |
| 229 | ++ CachedPreviousDef.insert({CurrentBB, &*Defs->rbegin()}); |
| 230 | ++ ReturnStack.push(&*Defs->rbegin()); |
| 231 | ++ WorkStack.pop(); |
| 232 | + } else { |
| 233 | +- unsigned i = 0; |
| 234 | +- for (auto *Pred : predecessors(BB)) |
| 235 | +- Phi->addIncoming(&*PhiOps[i++], Pred); |
| 236 | +- InsertedPHIs.push_back(Phi); |
| 237 | ++ // Start normal search from the beginning. |
| 238 | ++ CurrentFrame.ResumeAt = START; |
| 239 | ++ } |
| 240 | ++ break; |
| 241 | ++ } |
| 242 | ++ case COLLECT_VALUE_FROM_UNIQUE_PREDECESSOR: { |
| 243 | ++ WorkStack.pop(); |
| 244 | ++ MemoryAccess *Result = ReturnStack.top(); |
| 245 | ++ CachedPreviousDef.insert({CurrentBB, Result}); |
| 246 | ++ break; |
| 247 | ++ } |
| 248 | ++ case PROCESS_PREDECESSOR: { |
| 249 | ++ if (MSSA->DT->isReachableFromEntry(CurrentFrame.BB)) { |
| 250 | ++ CurrentFrame.ResumeAt = GET_PREVIOUS_DEF_FROM_END; |
| 251 | ++ } else { |
| 252 | ++ ReturnStack.push(nullptr); |
| 253 | ++ WorkStack.pop(); |
| 254 | ++ } |
| 255 | ++ break; |
| 256 | ++ } |
| 257 | ++ case COLLECT_VALUES_FROM_PREDECESSORS: { |
| 258 | ++ MemoryAccess *IncomingAccess = ReturnStack.top(); |
| 259 | ++ if (IncomingAccess) { |
| 260 | ++ if (!CurrentFrame.SingleAccess) { |
| 261 | ++ CurrentFrame.SingleAccess = IncomingAccess; |
| 262 | ++ } else if (IncomingAccess != CurrentFrame.SingleAccess) |
| 263 | ++ CurrentFrame.UniqueIncomingAccess = false; |
| 264 | ++ CurrentFrame.PhiOps.push_back(IncomingAccess); |
| 265 | ++ } else { |
| 266 | ++ CurrentFrame.PhiOps.push_back(MSSA->getLiveOnEntryDef()); |
| 267 | ++ } |
| 268 | ++ ReturnStack.pop(); |
| 269 | ++ // Process remaining predecessors. |
| 270 | ++ ++CurrentFrame.PredIt; |
| 271 | ++ if (CurrentFrame.PredIt != CurrentFrame.PredEnd) { |
| 272 | ++ WorkStack.push(StackFrame(*CurrentFrame.PredIt, PROCESS_PREDECESSOR)); |
| 273 | ++ } else { |
| 274 | ++ CurrentFrame.ResumeAt = SIMPLIFY_OPS; |
| 275 | ++ } |
| 276 | ++ break; |
| 277 | ++ } |
| 278 | ++ case SIMPLIFY_OPS: { |
| 279 | ++ // Now try to simplify the ops to avoid placing a phi. |
| 280 | ++ // This may return null if we never created a phi yet, that's okay |
| 281 | ++ MemoryPhi *Phi = |
| 282 | ++ dyn_cast_or_null<MemoryPhi>(MSSA->getMemoryAccess(CurrentBB)); |
| 283 | ++ |
| 284 | ++ // See if we can avoid the phi by simplifying it. |
| 285 | ++ auto *Result = tryRemoveTrivialPhi(Phi, CurrentFrame.PhiOps); |
| 286 | ++ // If we couldn't simplify, we may have to create a phi |
| 287 | ++ if (Result == Phi && CurrentFrame.UniqueIncomingAccess && |
| 288 | ++ CurrentFrame.SingleAccess) { |
| 289 | ++ // A concrete Phi only exists if we created an empty one to break a |
| 290 | ++ // cycle. |
| 291 | ++ if (Phi) { |
| 292 | ++ assert(Phi->operands().empty() && "Expected empty Phi"); |
| 293 | ++ Phi->replaceAllUsesWith(CurrentFrame.SingleAccess); |
| 294 | ++ removeMemoryAccess(Phi); |
| 295 | ++ } |
| 296 | ++ Result = CurrentFrame.SingleAccess; |
| 297 | ++ } else if (Result == Phi && !(CurrentFrame.UniqueIncomingAccess && |
| 298 | ++ CurrentFrame.SingleAccess)) { |
| 299 | ++ if (!Phi) |
| 300 | ++ Phi = MSSA->createMemoryPhi(CurrentBB); |
| 301 | ++ |
| 302 | ++ // See if the existing phi operands match what we need. |
| 303 | ++ // Unlike normal SSA, we only allow one phi node per block, so we |
| 304 | ++ // can't just create a new one. |
| 305 | ++ if (Phi->getNumOperands() != 0) { |
| 306 | ++ // FIXME: Figure out whether this is dead code and if so remove |
| 307 | ++ // it. |
| 308 | ++ if (!std::equal(Phi->op_begin(), Phi->op_end(), |
| 309 | ++ CurrentFrame.PhiOps.begin())) { |
| 310 | ++ // These will have been filled in by the recursive read we did |
| 311 | ++ // above. |
| 312 | ++ llvm::copy(CurrentFrame.PhiOps, Phi->op_begin()); |
| 313 | ++ std::copy(pred_begin(CurrentBB), pred_end(CurrentBB), |
| 314 | ++ Phi->block_begin()); |
| 315 | ++ } |
| 316 | ++ } else { |
| 317 | ++ unsigned i = 0; |
| 318 | ++ for (auto *Pred : predecessors(CurrentBB)) |
| 319 | ++ Phi->addIncoming(&*CurrentFrame.PhiOps[i++], Pred); |
| 320 | ++ InsertedPHIs.push_back(Phi); |
| 321 | ++ } |
| 322 | ++ Result = Phi; |
| 323 | + } |
| 324 | +- Result = Phi; |
| 325 | ++ // Set ourselves up for the next variable by resetting visited state. |
| 326 | ++ VisitedBlocks.erase(CurrentBB); |
| 327 | ++ CachedPreviousDef.insert({CurrentBB, Result}); |
| 328 | ++ ReturnStack.push(Result); |
| 329 | ++ WorkStack.pop(); |
| 330 | + } |
| 331 | ++ } // end of 'switch' statement |
| 332 | ++ } // end of 'while' loop |
| 333 | + |
| 334 | +- // Set ourselves up for the next variable by resetting visited state. |
| 335 | +- VisitedBlocks.erase(BB); |
| 336 | +- CachedPreviousDef.insert({BB, Result}); |
| 337 | +- return Result; |
| 338 | ++ if (ReturnStack.size() != 1) { |
| 339 | ++ llvm_unreachable("There should be only one return value"); |
| 340 | + } |
| 341 | +- llvm_unreachable("Should have hit one of the three cases above"); |
| 342 | ++ return ReturnStack.top(); |
| 343 | + } |
| 344 | + |
| 345 | + // This starts at the memory access, and goes backwards in the block to find the |
| 346 | +@@ -145,7 +246,7 @@ MemoryAccess *MemorySSAUpdater::getPreviousDef(MemoryAccess *MA) { |
| 347 | + if (auto *LocalResult = getPreviousDefInBlock(MA)) |
| 348 | + return LocalResult; |
| 349 | + DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> CachedPreviousDef; |
| 350 | +- return getPreviousDefRecursive(MA->getBlock(), CachedPreviousDef); |
| 351 | ++ return getPreviousDefIterative(MA->getBlock(), CachedPreviousDef); |
| 352 | + } |
| 353 | + |
| 354 | + // This starts at the memory access, and goes backwards in the block to the find |
| 355 | +@@ -186,7 +287,7 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefFromEnd( |
| 356 | + return &*Defs->rbegin(); |
| 357 | + } |
| 358 | + |
| 359 | +- return getPreviousDefRecursive(BB, CachedPreviousDef); |
| 360 | ++ return getPreviousDefIterative(BB, CachedPreviousDef); |
| 361 | + } |
| 362 | + // Recurse over a set of phi uses to eliminate the trivial ones |
| 363 | + MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) { |
| 364 | +-- |
| 365 | +2.46.0.windows.1 |
| 366 | + |
0 commit comments