llvm · jroelofs · Nov 6, 2024 · Oct 22, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
@@ -14,9 +14,11 @@
 #ifndef LLVM_MC_MCSCHEDULE_H
 #define LLVM_MC_MCSCHEDULE_H
 
-#include "llvm/Config/llvm-config.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <cassert>
+#include <optional>
 
 namespace llvm {
 
@@ -25,6 +27,7 @@ struct InstrItinerary;
 class MCSubtargetInfo;
 class MCInstrInfo;
 class MCInst;
+class MCInstrDesc;
 class InstrItineraryData;
 
 /// Define a kind of processor resource that will be modeled by the scheduler.
@@ -369,9 +372,19 @@ struct MCSchedModel {
                                  const MCSchedClassDesc &SCDesc);
 
   int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
+
   int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
                           const MCInst &Inst) const;
 
+  template <typename MCSubtargetInfo, typename MCInstrInfo,
+            typename InstrItineraryData, typename MCInstOrMachineInstr>
+  int computeInstrLatency(
+      const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+      const MCInstOrMachineInstr &Inst,
+      llvm::function_ref<const MCSchedClassDesc *(const MCSchedClassDesc *)>
+          ResolveVariantSchedClass =
+              [](const MCSchedClassDesc *SCDesc) { return SCDesc; }) const;
+
   // Returns the reciprocal throughput information from a MCSchedClassDesc.
   static double
   getReciprocalThroughput(const MCSubtargetInfo &STI,
@@ -393,6 +406,54 @@ struct MCSchedModel {
   static const MCSchedModel Default;
 };
 
+// The first three are only template'd arguments so we can get away with leaving
+// them as incomplete types below. The fourth is a template over
+// MCInst/MachineInstr so as to avoid a layering violation here that would make
+// the MC layer depend on CodeGen.
+template <typename MCSubtargetInfo, typename MCInstrInfo,
+          typename InstrItineraryData, typename MCInstOrMachineInstr>
+int MCSchedModel::computeInstrLatency(
+    const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+    const MCInstOrMachineInstr &Inst,
+    llvm::function_ref<const MCSchedClassDesc *(const MCSchedClassDesc *)>
+        ResolveVariantSchedClass) const {
+  // Sentinel returned when neither model can describe the instruction.
+  static const int NoInformationAvailable = -1;
+
+  // Preferred path: the scheduling model has a per-instruction table.
+  if (hasInstrSchedModel()) {
+    const unsigned SchedClass = MCII.get(Inst.getOpcode()).getSchedClass();
+    // The callback may swap in another class descriptor (e.g. to resolve a
+    // variant class) or return null when it cannot provide one.
+    const MCSchedClassDesc *Resolved =
+        ResolveVariantSchedClass(getSchedClassDesc(SchedClass));
+    if (Resolved && Resolved->isValid())
+      return MCSchedModel::computeInstrLatency(STI, *Resolved);
+    return NoInformationAvailable;
+  }
+
+  // No scheduling table (note the default model does not have one): try to
+  // fall back to the itinerary model.
+
+  // Itineraries are selected by CPU name, so an empty name is a dead end.
+  const llvm::StringRef CPUName = STI.getCPU();
+  if (CPUName.empty())
+    return NoInformationAvailable;
+
+  InstrItineraryData Itin = STI.getInstrItineraryForCPU(CPUName);
+  const MCInstrDesc &InstDesc = MCII.get(Inst.getOpcode());
+  const unsigned ItinClass = InstDesc.getSchedClass();
+
+  // Report the largest operand cycle the itinerary records; operands without
+  // an entry are skipped, so the result is 0 when no operand has one.
+  unsigned MaxCycle = 0;
+  for (unsigned Op = 0, NumOps = Inst.getNumOperands(); Op != NumOps; ++Op)
+    if (std::optional<unsigned> Cycle = Itin.getOperandCycle(ItinClass, Op))
+      MaxCycle = std::max(MaxCycle, *Cycle);
+
+  return int(MaxCycle);
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -91,6 +91,7 @@
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -166,6 +167,13 @@ static cl::opt<bool> EmitJumpTableSizesSection(
     cl::desc("Emit a section containing jump table addresses and sizes"),
     cl::Hidden, cl::init(false));
 
+// Off by default: several scheduling models are not completely accurate, so
+// the latencies could mislead. Only takes effect for verbose asm output.
+static cl::opt<bool> PrintLatency(
+    "asm-print-latency",
+    cl::desc("Print instruction latencies as verbose asm comments"), cl::Hidden,
+    cl::init(false));
+
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
 char AsmPrinter::ID = 0;
@@ -1085,7 +1093,8 @@ void AsmPrinter::emitFunctionEntryLabel() {
 }
 
 /// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, const MCSubtargetInfo *STI,
+                         raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getMF();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -1113,6 +1122,17 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   // Check for spill-induced copies
   if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
     CommentOS << " Reload Reuse\n";
+
+  if (PrintLatency) {
+    // Reuse the function-level TII; it is passed below as the MCInstrInfo.
+    const MCSchedModel &SCModel = STI->getSchedModel();
+    int Latency = SCModel.computeInstrLatency<MCSubtargetInfo, MCInstrInfo,
+                                              InstrItineraryData, MachineInstr>(
+        *STI, *TII, MI);
+    // Report only interesting latencies.
+    if (Latency > 1)
+      CommentOS << " Latency: " << Latency << "\n";
+  }
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -1763,6 +1783,12 @@ void AsmPrinter::emitFunctionBody() {
   int NumInstsInFunction = 0;
   bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch");
 
+  const MCSubtargetInfo *STI = nullptr;
+  if (this->MF)
+    STI = &getSubtargetInfo();
+  else
+    STI = TM.getMCSubtargetInfo();
+
   bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
@@ -1786,7 +1812,7 @@ void AsmPrinter::emitFunctionBody() {
         Handler->beginInstruction(&MI);
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->getCommentOS());
+        emitComments(MI, STI, OutStreamer->getCommentOS());
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:

diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -162,74 +162,13 @@ static void emitComments(LLVMDisasmContext *DC,
   DC->CommentsToEmit.clear();
 }
 
-/// Gets latency information for \p Inst from the itinerary
-/// scheduling model, based on \p DC information.
-/// \return The maximum expected latency over all the operands or -1
-/// if no information is available.
-static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
-  const int NoInformationAvailable = -1;
-
-  // Check if we have a CPU to get the itinerary information.
-  if (DC->getCPU().empty())
-    return NoInformationAvailable;
-
-  // Get itinerary information.
-  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
-  InstrItineraryData IID = STI->getInstrItineraryForCPU(DC->getCPU());
-  // Get the scheduling class of the requested instruction.
-  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
-  unsigned SCClass = Desc.getSchedClass();
-
-  unsigned Latency = 0;
-
-  for (unsigned Idx = 0, IdxEnd = Inst.getNumOperands(); Idx != IdxEnd; ++Idx)
-    if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
-      Latency = std::max(Latency, *OperCycle);
-
-  return (int)Latency;
-}
-
-/// Gets latency information for \p Inst, based on \p DC information.
-/// \return The maximum expected latency over all the definitions or -1
-/// if no information is available.
-static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
-  // Try to compute scheduling information.
-  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
-  const MCSchedModel SCModel = STI->getSchedModel();
-  const int NoInformationAvailable = -1;
-
-  // Check if we have a scheduling model for instructions.
-  if (!SCModel.hasInstrSchedModel())
-    // Try to fall back to the itinerary model if the scheduling model doesn't
-    // have a scheduling table.  Note the default does not have a table.
-    return getItineraryLatency(DC, Inst);
-
-  // Get the scheduling class of the requested instruction.
-  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
-  unsigned SCClass = Desc.getSchedClass();
-  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
-  // Resolving the variant SchedClass requires an MI to pass to
-  // SubTargetInfo::resolveSchedClass.
-  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
-    return NoInformationAvailable;
-
-  // Compute output latency.
-  int16_t Latency = 0;
-  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
-       DefIdx != DefEnd; ++DefIdx) {
-    // Lookup the definition's write latency in SubtargetInfo.
-    const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc,
-                                                                   DefIdx);
-    Latency = std::max(Latency, WLEntry->Cycles);
-  }
-
-  return Latency;
-}
-
 /// Emits latency information in DC->CommentStream for \p Inst, based
 /// on the information available in \p DC.
 static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
-  int Latency = getLatency(DC, Inst);
+  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+  const MCInstrInfo *MCII = DC->getInstrInfo();
+  const MCSchedModel &SCModel = STI->getSchedModel();
+  int Latency = SCModel.computeInstrLatency(*STI, *MCII, Inst);
 
   // Report only interesting latencies.
   if (Latency < 2)

diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
@@ -69,21 +69,28 @@ int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI,
 int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI,
                                       const MCInstrInfo &MCII,
                                       const MCInst &Inst) const {
-  unsigned SchedClass = MCII.get(Inst.getOpcode()).getSchedClass();
-  const MCSchedClassDesc *SCDesc = getSchedClassDesc(SchedClass);
-  if (!SCDesc->isValid())
-    return 0;
-
-  unsigned CPUID = getProcessorID();
-  while (SCDesc->isVariant()) {
-    SchedClass = STI.resolveVariantSchedClass(SchedClass, &Inst, &MCII, CPUID);
-    SCDesc = getSchedClassDesc(SchedClass);
-  }
-
-  if (SchedClass)
-    return MCSchedModel::computeInstrLatency(STI, *SCDesc);
-
-  llvm_unreachable("unsupported variant scheduling class");
+  return MCSchedModel::computeInstrLatency<MCSubtargetInfo, MCInstrInfo,
+                                           InstrItineraryData, MCInst>(
+      STI, MCII, Inst,
+      [&](const MCSchedClassDesc *SCDesc) -> const MCSchedClassDesc * {
+        if (!SCDesc->isValid())
+          return nullptr;
+
+        unsigned CPUID = getProcessorID();
+        // Seed with the instruction's own class so a non-variant class is
+        // accepted as-is and variants are resolved from the right class.
+        unsigned SchedClass = MCII.get(Inst.getOpcode()).getSchedClass();
+        while (SCDesc->isVariant()) {
+          SchedClass =
+              STI.resolveVariantSchedClass(SchedClass, &Inst, &MCII, CPUID);
+          SCDesc = getSchedClassDesc(SchedClass);
+        }
+        if (!SchedClass) {
+          assert(false && "unsupported variant scheduling class");
+          return nullptr;
+        }
+        return SCDesc;
+      });
 }
 
 double

diff --git a/llvm/test/CodeGen/AArch64/latency.ll b/llvm/test/CodeGen/AArch64/latency.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=1 | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone -asm-print-latency=0 | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mcpu=cyclone                      | FileCheck %s --match-full-lines --check-prefix=OFF
+
+define <4 x i64> @load_v4i64(ptr %ptr){
+; ON:     ldp q0, q1, [x0] ; Latency: 4
+; OFF:    ldp q0, q1, [x0]
+  %a = load <4 x i64>, ptr %ptr
+  ret <4 x i64> %a
+}
diff --git a/llvm/test/CodeGen/ARM/latency.ll b/llvm/test/CodeGen/ARM/latency.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0 -asm-print-latency=1 | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0 -asm-print-latency=0 | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: llc -mtriple=thumb-none-eabi %s -o - -mcpu=cortex-m0                      | FileCheck %s --match-full-lines --check-prefix=OFF
+
+define i64 @load_i64(ptr %ptr){
+; ON:   ldr     r2, [r0]                        @  Latency: 4
+; ON:   ldr     r1, [r0, #4]                    @  Latency: 4
+; ON:   mov     r0, r2                          @  Latency: 2
+; ON:   bx      lr
+; OFF:  ldr     r2, [r0]
+; OFF:  ldr     r1, [r0, #4]
+; OFF:  mov     r0, r2
+; OFF:  bx      lr
+  %a = load i64, ptr %ptr
+  ret i64 %a
+}