llvm · jdenny-ornl · Jan 29, 2025 · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024
diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst
@@ -0,0 +1,61 @@
+==========
+KernelInfo
+==========
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This LLVM IR pass reports various statistics for codes compiled for GPUs.  The
+goal of these statistics is to help identify bad code patterns and ways to
+mitigate them.  The pass operates at the LLVM IR level so that it can, in
+theory, support any LLVM-based compiler for programming languages supporting
+GPUs.
+
+By default, the pass is disabled.  For convenience, the command-line option
+``-kernel-info-end-lto`` inserts it at the end of LTO, and options like
+``-Rpass=kernel-info`` enable its remarks.  Example ``opt`` and ``clang``
+command lines appear in the next section.
+
+Remarks include summary statistics (e.g., total size of static allocas) and
+individual occurrences (e.g., source location of each alloca).  Examples of the
+output appear in tests in `llvm/test/Analysis/KernelInfo`.
+
+Example Command Lines
+=====================
+
+To analyze a C program as it appears to an LLVM GPU backend at the end of LTO:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -kernel-info-end-lto
+
+To analyze specified LLVM IR, perhaps previously generated by something like
+``clang -save-temps -g -fopenmp --offload-arch=native test.c``:
+
+.. code-block:: shell
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes=kernel-info
+
+kernel-info can also be inserted into a specified LLVM pass pipeline using
+``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -kernel-info-end-lto \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>'
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)'
diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -0,0 +1,123 @@
+//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter
+// classes used to extract function properties from a GPU kernel.
+//
+// See llvm/docs/KernelInfo.rst.
+// ===---------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_KERNELINFO_H
+#define LLVM_ANALYSIS_KERNELINFO_H
+
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+class DominatorTree;
+class Function;
+
+/// Data structure holding function info for kernels.
+class KernelInfo {
+  void updateForBB(const BasicBlock &BB, int64_t Direction,
+                   OptimizationRemarkEmitter &ORE,
+                   const TargetTransformInfo &TTI);
+
+public:
+  static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM);
+
+  bool operator==(const KernelInfo &FPI) const {
+    return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0;
+  }
+
+  bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); }
+
+  /// If false, nothing was recorded here because the supplied function didn't
+  /// appear in a module compiled for a GPU.
+  bool IsValid = false;
+
+  /// Whether the function has external linkage and is not a kernel function.
+  bool ExternalNotKernel = false;
+
+  /// OpenMP Launch bounds.
+  ///@{
+  std::optional<int64_t> OmpTargetNumTeams;
+  std::optional<int64_t> OmpTargetThreadLimit;
+  ///@}
+
+  /// AMDGPU launch bounds.
+  ///@{
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsX;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsY;
+  std::optional<int64_t> AmdgpuMaxNumWorkgroupsZ;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMin;
+  std::optional<int64_t> AmdgpuFlatWorkGroupSizeMax;
+  std::optional<int64_t> AmdgpuWavesPerEuMin;
+  std::optional<int64_t> AmdgpuWavesPerEuMax;
+  ///@}
+
+  /// NVPTX launch bounds.
+  ///@{
+  std::optional<int64_t> Maxclusterrank;
+  std::optional<int64_t> Maxntidx;
+  ///@}
+
+  /// The number of alloca instructions inside the function, the number of those
+  /// with allocation sizes that cannot be determined at compile time, and the
+  /// sum of the sizes that can be.
+  ///
+  /// With the current implementation for at least some GPU archs,
+  /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
+  /// case the implementation changes.
+  int64_t Allocas = 0;
+  int64_t AllocasDyn = 0;
+  int64_t AllocasStaticSizeSum = 0;
+
+  /// Number of direct/indirect calls (anything derived from CallBase).
+  int64_t DirectCalls = 0;
+  int64_t IndirectCalls = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of calls of type InvokeInst.
+  int64_t Invokes = 0;
+
+  /// Number of addrspace(0) memory accesses (via load, store, etc.).
+  int64_t AddrspaceZeroAccesses = 0;
+};
+
+/// Analysis class for KernelInfo.
+class KernelInfoAnalysis : public AnalysisInfoMixin<KernelInfoAnalysis> {
+public:
+  static AnalysisKey Key;
+
+  using Result = const KernelInfo;
+
+  KernelInfo run(Function &F, FunctionAnalysisManager &FAM) {
+    return KernelInfo::getKernelInfo(F, FAM);
+  }
+};
+
+/// Printer pass for KernelInfoAnalysis.
+///
+/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled.
+class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+public:
+  explicit KernelInfoPrinter() {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
+    AM.getResult<KernelInfoAnalysis>(F);
+    return PreservedAnalyses::all();
+  }
+
+  static bool isRequired() { return true; }
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_KERNELINFO_H
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
@@ -18,6 +18,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Target/CGPassBuilderOption.h"
@@ -27,6 +28,8 @@
 #include <string>
 #include <utility>
 
+extern llvm::cl::opt<bool> KernelInfoEndLTO;
+
 namespace llvm {
 
 class AAManager;

diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis
   InstructionPrecedenceTracking.cpp
   InstructionSimplify.cpp
   InteractiveModelRunner.cpp
+  KernelInfo.cpp
   LazyBranchProbabilityInfo.cpp
   LazyBlockFrequencyInfo.cpp
   LazyCallGraph.cpp