[AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS

gandhi56 · gandhi56 · commit 032f3d9f0a35 · 2024-04-24T05:42:58.000Z
The purpose of this pass is to ensure that the
combined module contains as many LDS global variables
as there are kernels that (indirectly) access them.
As LDS variables behave like C++ static variables,
it is important that each partition contains a
unique copy of the variable on a per kernel basis.
This representation also prepares the combined
module to eliminate cross-module dependencies of
LDS variables.

This pass operates as follows:
1. Firstly, traverse the call graph from each kernel
   to determine the number of kernels calling each
   device function.
2. For each LDS global variable GV, determine the
   function F that defines it. Collect its caller
   functions. Clone F and GV, and finally insert a
   call/invoke instruction in each caller function.

Change-Id: I998291a389ea3db10de9122f08fe55c981da6049
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass
   const TargetMachine &TM;
 };
 
+/// Module pass that clones functions using LDS globals, and those globals.
+struct AMDGPUCloneModuleLDSPass : PassInfoMixin<AMDGPUCloneModuleLDSPass> {
+  PreservedAnalyses run(Module &, ModuleAnalysisManager &);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp
@@ -0,0 +1,139 @@
+//===-- AMDGPUCloneModuleLDS.cpp ----------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to ensure that the combined module contains
+// as many LDS global variables as there are kernels that (indirectly) access
+// them. As LDS variables behave like C++ static variables, it is important that
+// each partition contains a unique copy of the variable on a per kernel basis.
+// This representation also prepares the combined module to eliminate
+// cross-module dependencies of LDS variables.
+//
+// This pass operates as follows:
+// 1. Firstly, traverse the call graph from each kernel to determine the number
+//    of kernels calling each device function.
+// 2. For each LDS global variable GV, determine the function F that defines it.
+//    Collect its caller functions. Clone F and GV, and finally insert a
+//    call/invoke instruction in each caller function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-clone-module-lds"
+
+static cl::opt<unsigned> MaxCountForClonedFunctions(
+    "clone-lds-functions-max-count", cl::Hidden, cl::init(16),
+    cl::desc("Specify a limit to the number of clones of a function"));
+
+/// Find a function that uses \p GV, looking through operator users of it.
+/// \param GV The global variable in question
+/// \return The function of the first instruction user found, else nullptr
+static Function *getFunctionDefiningGV(GlobalVariable &GV) {
+  SmallVector<User *> Worklist(GV.users());
+  while (!Worklist.empty()) {
+    User *U = Worklist.pop_back_val();
+    if (auto *Inst = dyn_cast<Instruction>(U))
+      return Inst->getFunction();
+    if (auto *Op = dyn_cast<Operator>(U))
+      append_range(Worklist, Op->users());
+  }
+  return nullptr;
+}
+
+/// For each initialized LDS global, clone the global and the function found
+/// using it once per extra kernel counted, and duplicate the calls to it.
+PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
+                                                ModuleAnalysisManager &AM) {
+  if (MaxCountForClonedFunctions.getValue() == 1)
+    return PreservedAnalyses::all();
+
+  bool Changed = false;
+  auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+  // For each function in the call graph, determine the number of
+  // ancestor-caller kernels. NOTE(review): the walk starts at &CG and never
+  // reads Fn, so each visited function is counted once per kernel; confirm.
+  DenseMap<Function *, unsigned int> KernelRefsToFuncs;
+  for (auto &Fn : M) {
+    if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+    for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I)
+      if (auto *F = I->getFunction())
+        KernelRefsToFuncs[F]++;
+  }
+
+  for (auto &GV : M.globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+        !GV.hasInitializer())
+      continue;
+
+    auto *OldF = getFunctionDefiningGV(GV);
+    if (!OldF) // No instruction uses GV, so there is nothing to clone.
+      continue;
+    LLVM_DEBUG(dbgs() << "Found LDS " << GV.getName() << " used in function "
+                      << OldF->getName() << '\n');
+
+    // Collect all call instructions to OldF. A use that only passes OldF as
+    // an argument is skipped, since the call's callee must not be retargeted.
+    SmallVector<CallBase *> CallsToOldF;
+    for (auto &U : OldF->uses())
+      if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+        CallsToOldF.push_back(CB);
+
+    // Clone the function once per kernel counted for it beyond the first; each
+    // clone gets its own copy of the LDS global and of every collected call.
+    const unsigned int NumKernelRefs = KernelRefsToFuncs.lookup(OldF);
+    LLVM_DEBUG(dbgs() << "\tFunction is referenced by " << NumKernelRefs
+                      << " kernels.\n");
+    const unsigned int NumCopies =
+        std::min(NumKernelRefs, MaxCountForClonedFunctions.getValue());
+    for (unsigned int ID = 0; ID + 1 < NumCopies; ++ID) {
+      // Clone LDS global variable
+      auto *NewGV = new GlobalVariable(
+          M, GV.getValueType(), GV.isConstant(), GlobalValue::InternalLinkage,
+          PoisonValue::get(GV.getValueType()),
+          GV.getName() + ".clone." + Twine(ID), &GV,
+          GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+      NewGV->copyAttributesFrom(&GV);
+      NewGV->copyMetadata(&GV, 0);
+      NewGV->setComdat(GV.getComdat());
+      LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
+                        << '\n');
+
+      // Clone function
+      ValueToValueMapTy VMap;
+      VMap[&GV] = NewGV;
+      auto *NewF = CloneFunction(OldF, VMap);
+      NewF->setName(OldF->getName() + ".clone." + Twine(ID));
+      LLVM_DEBUG(dbgs() << "Inserting function clone with name "
+                        << NewF->getName() << '\n');
+
+      // Create a new call to the cloned function after each original call
+      for (CallBase *Call : CallsToOldF) {
+        auto *NewCall = cast<CallBase>(Call->clone());
+        if (!Call->getType()->isVoidTy()) // Void values cannot be named.
+          NewCall->setName(Call->getName() + ".clone." + Twine(ID));
+        NewCall->setCalledOperand(NewF);
+        NewCall->insertAfter(Call);
+        LLVM_DEBUG(dbgs() << "Inserting inst: " << *NewCall << '\n');
+      }
+      Changed = true;
+    }
+  }
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
             AMDGPULowerBufferFatPointersPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass())
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -725,6 +725,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
         // We want to support the -lto-partitions=N option as "best effort".
         // For that, we need to lower LDS earlier in the pipeline before the
         // module is partitioned for codegen.
+        PM.addPass(AMDGPUCloneModuleLDSPass());
         if (EnableLowerModuleLDS)
           PM.addPass(AMDGPULowerModuleLDSPass(*this));
       });
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
   AMDGPUCallLowering.cpp
+  AMDGPUCloneModuleLDS.cpp
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll
@@ -0,0 +1,58 @@
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; In this example, the CloneModuleLDS pass creates two copies of LDS_GV
+; as two kernels call the same device function where LDS_GV is used.
+
+; CHECK: [[LDS_GV_CLONE:@.*\.clone\.0]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[LDS_GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+
+define protected amdgpu_kernel void @kernel1(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
+; CHECK-NEXT:    ret void
+; First of the two kernels that call @lds_func directly.
+entry:
+  %call = call i32 @lds_func(i32 %n)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
+; CHECK-NEXT:    ret void
+; Second of the two kernels that call @lds_func directly.
+entry:
+  %call = call i32 @lds_func(i32 %n)
+  ret void
+}
+
+
+define i32 @lds_func(i32 %x) {
+; CHECK-LABEL: define i32 @lds_func(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV]] to ptr), i64 0, i64 0
+; CHECK-NEXT:    store i32 [[X]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[X]]
+; Stores %x to element 0 of @lds_gv (via an addrspacecast) and returns %x.
+entry:
+  %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
+  store i32 %x, ptr %p
+  ret i32 %x
+}
+
+; CHECK-LABEL: define i32 @lds_func.clone.0(i32 %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV_CLONE]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   store i32 %x, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %x
+
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; Before transformation,                    After transformation,
+;  K1  K2                                    K1  K2
+;  |  /                                      |  /
+;  | /                                       | /
+;  A                         ==>             A
+;  | \                                       | \
+;  |  \                                      |  \
+;  B   C                                     B   C
+;  |                                         | \
+;  X                                         X1 X2
+;
+; where X contains an LDS reference
+
+; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+
+define protected amdgpu_kernel void @kernel1(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @A()
+; CHECK-NEXT:    ret void
+; @A is defined as void(), so the kernel calls it with that exact signature.
+entry:
+  call void @A()
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @A()
+; CHECK-NEXT:    ret void
+; @A is defined as void(), so the kernel calls it with that exact signature.
+entry:
+  call void @A()
+  ret void
+}
+
+define void @A() {
+; CHECK-LABEL: define void @A() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = call i32 @B()
+; CHECK-NEXT:    call void @C()
+; CHECK-NEXT:    ret void
+; @B returns i32, so the call uses that signature and ignores the result.
+entry:
+  %b = call i32 @B()
+  call void @C()
+  ret void
+}
+
+define i32 @B() {
+; CHECK-LABEL: define i32 @B() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 5, ptr [[P]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @X(ptr [[P]])
+; CHECK-NEXT:    [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
+; CHECK-NEXT:    ret i32 [[RET]]
+; Stores 5 to a stack slot, passes its address to @X, returns @X's result.
+entry:
+  %p = alloca i32
+  store i32 5, ptr %p
+  %ret = call i32 @X(ptr %p)
+  ret i32 %ret
+}
+
+define void @C() {
+; CHECK-LABEL: define void @C() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+; Empty function: it makes no calls, so it has no path to @X.
+entry:
+  ret void
+}
+
+define i32 @X(ptr %x) {
+; CHECK-LABEL: define i32 @X(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT:    store i32 [[V]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+; Loads an i32 from %x, stores it to element 0 of @lds_gv, and returns it.
+entry:
+  %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
+  %v = load i32, ptr %x
+  store i32 %v, ptr %p
+  ret i32 %v
+}
+
+; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   %v = load i32, ptr %x, align 4
+; CHECK-NEXT:   store i32 %v, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %v
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll