[AMDGPU][Inliner] Remove amdgpu-inline and add a new TTI inline hook

aeubanks · aeubanks · commit a11bf9a7fbd3 · 2021-01-21T20:29:17.000-08:00
Having a custom inliner doesn't really fit in with the new PM's pipeline. It's also extra technical debt. amdgpu-inline only does a couple of custom things compared to the normal inliner: 1) it disables inlining if the number of BBs in a function would exceed some limit; 2) it increases the threshold if there are pointers to private arrays(?). These can all be handled as TTI inliner hooks. There already exists a hook for backends to multiply the inlining threshold; this commit adds a second hook, adjustInliningThreshold(), that returns a value to be added to the threshold. This way we can remove the custom amdgpu-inline pass. This caused inline-hint.ll to fail, and after some investigation, it looks like getInliningThresholdMultiplier() was previously getting applied twice in amdgpu-inline (https://reviews.llvm.org/D62707 fixed it not applying at all, so some later inliner change must have fixed something), so I had to change the threshold in the test. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D94153
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -289,6 +289,9 @@ class TargetTransformInfo {
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
 
+  /// \returns A value to be added to the inlining threshold.
+  unsigned adjustInliningThreshold(const CallBase *CB) const;
+
   /// \returns Vector bonus in percent.
   ///
   /// Vector bonuses: We want to more aggressively inline vector-dense kernels
@@ -1395,6 +1398,7 @@ class TargetTransformInfo::Concept {
                          ArrayRef<const Value *> Operands,
                          TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() = 0;
   virtual int getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned
@@ -1679,6 +1683,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
+  unsigned adjustInliningThreshold(const CallBase *CB) override {
+    return Impl.adjustInliningThreshold(CB); // Delegate to the wrapped Impl.
+  }
   int getInlinerVectorBonusPercent() override {
     return Impl.getInlinerVectorBonusPercent();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -67,6 +67,7 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getInliningThresholdMultiplier() const { return 1; }
+  unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
 
   int getInlinerVectorBonusPercent() const { return 150; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -401,6 +401,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
 
   unsigned getInliningThresholdMultiplier() { return 1; }
+  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
 
   int getInlinerVectorBonusPercent() { return 150; }
 
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
@@ -1580,6 +1580,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
   Threshold *= TTI.getInliningThresholdMultiplier();
+  Threshold += TTI.adjustInliningThreshold(&Call);
 
   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
   VectorBonus = Threshold * VectorBonusPercent / 100;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -247,6 +247,11 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
   return TTIImpl->getInliningThresholdMultiplier();
 }
 
+unsigned
+TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {
+  return TTIImpl->adjustInliningThreshold(CB); // Forward the query to TTIImpl.
+}
+
 int TargetTransformInfo::getInlinerVectorBonusPercent() const {
   return TTIImpl->getInlinerVectorBonusPercent();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -327,9 +327,6 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
 
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
-Pass *createAMDGPUFunctionInliningPass();
-void initializeAMDGPUInlinerPass(PassRegistry&);
-
 ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
 void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
 extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -255,7 +255,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
-  initializeAMDGPUInlinerPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
@@ -423,7 +422,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
 
   if (EnableFunctionCalls) {
     delete Builder.Inliner;
-    Builder.Inliner = createAMDGPUFunctionInliningPass();
+    Builder.Inliner = createFunctionInliningPass();
   }
 
   Builder.addExtension(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -56,6 +56,24 @@ static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
     cl::init(32), cl::Hidden);
 
+static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
+                                       cl::Hidden, cl::init(4000),
+                                       cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers we gain nothing by aggressively inlining functions for that
+// heuristic.
+static cl::opt<unsigned>
+    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
+                    cl::init(256),
+                    cl::desc("Maximum alloca size to use for inline cost"));
+
+// Inliner constraint to achieve reasonable compilation time.
+static cl::opt<size_t> InlineMaxBB(
+    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
+    cl::desc("Maximum number of BBs allowed in a function after inlining"
+             " (compile time constraint)"));
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -1120,7 +1138,47 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
   // no way to support merge for backend defined attributes.
   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
-  return CallerMode.isInlineCompatible(CalleeMode);
+  if (!CallerMode.isInlineCompatible(CalleeMode))
+    return false;
+
+  // Hack to make compile times reasonable.
+  if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+    // Single BB does not increase total BB amount, thus subtract 1.
+    size_t BBSize = Caller->size() + Callee->size() - 1;
+    return BBSize <= InlineMaxBB;
+  }
+
+  return true;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  // A pointer to a private array passed into a function will not be optimized
+  // out, leaving scratch usage. Return ArgAllocaCost as a threshold increase
+  // for call sites that pass such a pointer, so that inlining is allowed.
+  uint64_t AllocaSize = 0; // Running total over the distinct allocas counted.
+  SmallPtrSet<const AllocaInst *, 8> AIVisited; // Count each alloca only once.
+  for (Value *PtrArg : CB->args()) {
+    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+      continue; // Only private or flat pointer arguments are considered.
+
+    PtrArg = getUnderlyingObject(PtrArg); // Resolve to the underlying object.
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+        continue; // Skip non-static allocas and allocas already counted.
+      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+      // Once the total exceeds ArgAllocaCutoff the stack memory is excessive:
+      // we cannot get rid of the scratch anyway, so drop the bonus and stop.
+      if (AllocaSize > ArgAllocaCutoff) {
+        AllocaSize = 0; // Zero makes the check after the loop return no bonus.
+        break;
+      }
+    }
+  }
+  if (AllocaSize)
+    return ArgAllocaCost; // Flat bonus; it does not scale with AllocaSize.
+  return 0;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -203,6 +203,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                            const Function *Callee) const;
 
   unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned adjustInliningThreshold(const CallBase *CB) const;
 
   int getInlinerVectorBonusPercent() { return 0; }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,7 +86,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetTransformInfo.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
-  AMDGPUInline.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDILCFGStructurizer.cpp
   AMDGPUPrintfRuntimeBinding.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
 
 define coldcc float @foo(float %x, float %y) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll b/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
@@ -1,5 +1,7 @@
-; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
-; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
+; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
+; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
+; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
+; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
 
 define i32 @callee(i32 %x) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn

Original file line number	Diff line number	Diff line change
`@@ -67,6 +67,7 @@ class TargetTransformInfoImplBase {`
`67`	`67`	`}`
`68`	`68`
`69`	`69`	`unsigned getInliningThresholdMultiplier() const { return 1; }`
	`70`	`+ unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }`
`70`	`71`
`71`	`72`	`int getInlinerVectorBonusPercent() const { return 150; }`
`72`	`73`
Original file line number	Diff line number	Diff line change
`@@ -401,6 +401,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {`
`401`	`401`	`}`
`402`	`402`
`403`	`403`	`unsigned getInliningThresholdMultiplier() { return 1; }`
	`404`	`+ unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }`
`404`	`405`
`405`	`406`	`int getInlinerVectorBonusPercent() { return 150; }`
`406`	`407`
Original file line number	Diff line number	Diff line change
`@@ -247,6 +247,11 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {`
`247`	`247`	`return TTIImpl->getInliningThresholdMultiplier();`
`248`	`248`	`}`
`249`	`249`
	`250`	`+unsigned`
	`251`	`+TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {`
	`252`	`+ return TTIImpl->adjustInliningThreshold(CB);`
	`253`	`+}`
	`254`	`+`
`250`	`255`	`int TargetTransformInfo::getInlinerVectorBonusPercent() const {`
`251`	`256`	`return TTIImpl->getInlinerVectorBonusPercent();`
`252`	`257`	`}`