-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISC-V] Base scheduling model for tt-ascalon-d8 #120160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
First part of tt-ascalon-d8 scheduling model, only containing scalar ops. Scheduling for vector instructions will be added in a follow-up patch. Co-authored-by: Anton Blanchard <[email protected]>
@llvm/pr-subscribers-backend-risc-v Author: Petr Penzin (ppenzin) Changes: First part of tt-ascalon-d8 scheduling model, only containing scalar ops. Scheduling for vector instructions will be added in a follow-up patch. Patch is 21.86 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/120160.diff 5 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 00c3d702e12a22..e5578a5447638f 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -54,6 +54,7 @@ include "RISCVSchedSyntacoreSCR1.td"
include "RISCVSchedSyntacoreSCR345.td"
include "RISCVSchedSyntacoreSCR7.td"
include "RISCVSchedXiangShanNanHu.td"
+include "RISCVSchedTTAscalonD8.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 445e084d07686b..84ef9faf7a37e9 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -454,7 +454,7 @@ def SYNTACORE_SCR7 : RISCVProcessorModel<"syntacore-scr7",
[TuneNoDefaultUnroll, TunePostRAScheduler]>;
def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
- NoSchedModel,
+ TTAscalonD8Model,
!listconcat(RVA23S64Features,
[FeatureStdExtSmaia,
FeatureStdExtSsaia,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
new file mode 100644
index 00000000000000..764e546beee189
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
@@ -0,0 +1,333 @@
+//=- RISCVSchedTTAscalonD8.td - Tenstorrent Ascalon Scheduling Defs -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+def TTAscalonD8Model : SchedMachineModel {
+ let IssueWidth = 8; // 8-way decode and dispatch
+ let MicroOpBufferSize = 256; // 256 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency; same value as the load WriteRes latency below
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling.
+ let LoopMicroOpBufferSize = 16;
+
+ let CompleteModel = 0; // Marked incomplete: only scalar ops have scheduling info so far
+
+ // TODO: These extensions are supported by the core, but scheduling info
+ // has not been added for them yet.
+ let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+ HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
+ HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
+ HasStdExtZkr, HasVInstructions, HasVInstructionsI64];
+}
+
+let SchedModel = TTAscalonD8Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+let BufferSize = 16 in { // BufferSize is set on every resource defined in this block
+ def AscalonLS : ProcResource<3>; // Loads, stores, and atomics
+ def AscalonFXA : ProcResource<1>; // ALU, FP/VEC -> INT, MUL, DIV, CSR
+ def AscalonFXB : ProcResource<1>; // ALU, INT -> FP/VEC
+ def AscalonFXC : ProcResource<2>; // ALU, BR
+ def AscalonFXD : ProcResource<2>; // ALU
+ def AscalonFP : ProcResource<2>; // FP arithmetic, FP -> FP converts, compare, classify
+ def AscalonV : ProcResource<2>; // Not used by any scalar WriteRes here; presumably vector - TODO confirm
+}
+
+def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>; // All FX resources; used for generic ALU/shift/bitmanip ops
+
+//===----------------------------------------------------------------------===//
+
+// Branching
+def : WriteRes<WriteJmp, [AscalonFXC]>;
+def : WriteRes<WriteJal, [AscalonFXC]>;
+def : WriteRes<WriteJalr, [AscalonFXC]>;
+
+// Integer arithmetic and logic
+def : WriteRes<WriteIALU32, [AscalonFX]>;
+def : WriteRes<WriteIALU, [AscalonFX]>;
+def : WriteRes<WriteShiftImm32, [AscalonFX]>;
+def : WriteRes<WriteShiftImm, [AscalonFX]>;
+def : WriteRes<WriteShiftReg32, [AscalonFX]>;
+def : WriteRes<WriteShiftReg, [AscalonFX]>;
+
+// Integer multiplication
+let Latency = 3 in {
+def : WriteRes<WriteIMul, [AscalonFXA]>;
+def : WriteRes<WriteIMul32, [AscalonFXA]>;
+}
+
+// Integer division
+// Worst-case latency is used; FXA is held for all 7 cycles (ReleaseAtCycles).
+
+let Latency = 7, ReleaseAtCycles = [7] in {
+ def : WriteRes<WriteIDiv32, [AscalonFXA]>;
+ def : WriteRes<WriteIDiv, [AscalonFXA]>;
+ def : WriteRes<WriteIRem32, [AscalonFXA]>;
+ def : WriteRes<WriteIRem, [AscalonFXA]>;
+}
+
+// Bitmanip
+def : WriteRes<WriteRotateImm, [AscalonFX]>;
+def : WriteRes<WriteRotateImm32, [AscalonFX]>;
+def : WriteRes<WriteRotateReg, [AscalonFX]>;
+def : WriteRes<WriteRotateReg32, [AscalonFX]>;
+
+def : WriteRes<WriteCLZ, [AscalonFX]>;
+def : WriteRes<WriteCLZ32, [AscalonFX]>;
+def : WriteRes<WriteCTZ, [AscalonFX]>;
+def : WriteRes<WriteCTZ32, [AscalonFX]>;
+
+def : WriteRes<WriteCPOP, [AscalonFX]>;
+def : WriteRes<WriteCPOP32, [AscalonFX]>;
+
+def : WriteRes<WriteORCB, [AscalonFX]>;
+
+def : WriteRes<WriteIMinMax, [AscalonFX]>;
+
+def : WriteRes<WriteREV8, [AscalonFX]>;
+
+def : WriteRes<WriteSHXADD, [AscalonFX]>;
+def : WriteRes<WriteSHXADD32, [AscalonFX]>;
+
+// Single-bit instructions
+def : WriteRes<WriteSingleBit, [AscalonFX]>;
+def : WriteRes<WriteSingleBitImm, [AscalonFX]>;
+def : WriteRes<WriteBEXT, [AscalonFX]>;
+def : WriteRes<WriteBEXTI, [AscalonFX]>;
+
+// Memory
+def : WriteRes<WriteSTB, [AscalonLS]>;
+def : WriteRes<WriteSTH, [AscalonLS]>;
+def : WriteRes<WriteSTW, [AscalonLS]>;
+def : WriteRes<WriteSTD, [AscalonLS]>;
+def : WriteRes<WriteFST16, [AscalonLS]>;
+def : WriteRes<WriteFST32, [AscalonLS]>;
+def : WriteRes<WriteFST64, [AscalonLS]>;
+
+let Latency = 4 in {
+def : WriteRes<WriteLDB, [AscalonLS]>;
+def : WriteRes<WriteLDH, [AscalonLS]>;
+def : WriteRes<WriteLDW, [AscalonLS]>;
+def : WriteRes<WriteLDD, [AscalonLS]>;
+def : WriteRes<WriteFLD16, [AscalonLS]>;
+def : WriteRes<WriteFLD32, [AscalonLS]>;
+def : WriteRes<WriteFLD64, [AscalonLS]>;
+}
+
+// Atomic memory
+def : WriteRes<WriteAtomicSTW, [AscalonLS]>;
+def : WriteRes<WriteAtomicSTD, [AscalonLS]>;
+
+let Latency = 4 in {
+def : WriteRes<WriteAtomicW, [AscalonLS]>;
+def : WriteRes<WriteAtomicD, [AscalonLS]>;
+def : WriteRes<WriteAtomicLDW, [AscalonLS]>;
+def : WriteRes<WriteAtomicLDD, [AscalonLS]>;
+}
+
+// Half precision.
+let Latency = 3 in {
+def : WriteRes<WriteFAdd16, [AscalonFP]>;
+def : WriteRes<WriteFMul16, [AscalonFP]>;
+def : WriteRes<WriteFMA16, [AscalonFP]>;
+def : WriteRes<WriteFSGNJ16, [AscalonFP]>;
+def : WriteRes<WriteFMinMax16, [AscalonFP]>;
+}
+
+let Latency = 7, ReleaseAtCycles = [7] in {
+def : WriteRes<WriteFDiv16, [AscalonFP]>;
+def : WriteRes<WriteFSqrt16, [AscalonFP]>;
+}
+
+// Single precision.
+let Latency = 3 in {
+def : WriteRes<WriteFAdd32, [AscalonFP]>;
+def : WriteRes<WriteFMul32, [AscalonFP]>;
+def : WriteRes<WriteFMA32, [AscalonFP]>;
+def : WriteRes<WriteFSGNJ32, [AscalonFP]>;
+def : WriteRes<WriteFMinMax32, [AscalonFP]>;
+}
+
+let Latency = 7, ReleaseAtCycles = [7] in {
+def : WriteRes<WriteFDiv32, [AscalonFP]>;
+def : WriteRes<WriteFSqrt32, [AscalonFP]>;
+}
+
+// Double precision
+let Latency = 3 in {
+def : WriteRes<WriteFAdd64, [AscalonFP]>;
+def : WriteRes<WriteFMul64, [AscalonFP]>;
+def : WriteRes<WriteFMA64, [AscalonFP]>;
+def : WriteRes<WriteFSGNJ64, [AscalonFP]>;
+def : WriteRes<WriteFMinMax64, [AscalonFP]>;
+}
+
+let Latency = 12, ReleaseAtCycles = [12] in {
+def : WriteRes<WriteFDiv64, [AscalonFP]>;
+def : WriteRes<WriteFSqrt64, [AscalonFP]>;
+}
+
+// Conversions
+def : WriteRes<WriteFCvtI32ToF16, [AscalonFXB]>;
+def : WriteRes<WriteFCvtI32ToF32, [AscalonFXB]>;
+def : WriteRes<WriteFCvtI32ToF64, [AscalonFXB]>;
+def : WriteRes<WriteFCvtI64ToF16, [AscalonFXB]>;
+def : WriteRes<WriteFCvtI64ToF32, [AscalonFXB]>;
+def : WriteRes<WriteFCvtI64ToF64, [AscalonFXB]>;
+def : WriteRes<WriteFCvtF16ToI32, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF16ToI64, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF16ToF32, [AscalonFP]>;
+def : WriteRes<WriteFCvtF16ToF64, [AscalonFP]>;
+def : WriteRes<WriteFCvtF32ToI32, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF32ToI64, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF32ToF16, [AscalonFP]>;
+def : WriteRes<WriteFCvtF32ToF64, [AscalonFP]>;
+def : WriteRes<WriteFCvtF64ToI32, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF64ToI64, [AscalonFXA]>;
+def : WriteRes<WriteFCvtF64ToF16, [AscalonFP]>;
+def : WriteRes<WriteFCvtF64ToF32, [AscalonFP]>;
+
+def : WriteRes<WriteFClass16, [AscalonFP]>;
+def : WriteRes<WriteFClass32, [AscalonFP]>;
+def : WriteRes<WriteFClass64, [AscalonFP]>;
+def : WriteRes<WriteFCmp16, [AscalonFP]>;
+def : WriteRes<WriteFCmp32, [AscalonFP]>;
+def : WriteRes<WriteFCmp64, [AscalonFP]>;
+
+def : WriteRes<WriteFMovI16ToF16, [AscalonFXB]>;
+def : WriteRes<WriteFMovF16ToI16, [AscalonFXA]>;
+def : WriteRes<WriteFMovI32ToF32, [AscalonFXB]>;
+def : WriteRes<WriteFMovF32ToI32, [AscalonFXA]>;
+def : WriteRes<WriteFMovI64ToF64, [AscalonFXB]>;
+def : WriteRes<WriteFMovF64ToI64, [AscalonFXA]>;
+
+// Others
+def : WriteRes<WriteCSR, [AscalonFXA]>;
+def : WriteRes<WriteNop, [AscalonFX]>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+//===----------------------------------------------------------------------===//
+// Bypass and advance
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShiftImm, 0>;
+def : ReadAdvance<ReadShiftImm32, 0>;
+def : ReadAdvance<ReadShiftReg, 0>;
+def : ReadAdvance<ReadShiftReg32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIRem, 0>;
+def : ReadAdvance<ReadIRem32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFStoreData, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFAdd16, 0>;
+def : ReadAdvance<ReadFAdd32, 0>;
+def : ReadAdvance<ReadFAdd64, 0>;
+def : ReadAdvance<ReadFMul16, 0>;
+def : ReadAdvance<ReadFMA16, 0>;
+def : ReadAdvance<ReadFMA16Addend, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
+def : ReadAdvance<ReadFMA32Addend, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
+def : ReadAdvance<ReadFMA64Addend, 0>;
+def : ReadAdvance<ReadFDiv16, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt16, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp16, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ16, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax16, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF16ToI32, 0>;
+def : ReadAdvance<ReadFCvtF16ToI64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF16, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF16, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFCvtF16ToF32, 0>;
+def : ReadAdvance<ReadFCvtF32ToF16, 0>;
+def : ReadAdvance<ReadFCvtF16ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF16, 0>;
+def : ReadAdvance<ReadFMovF16ToI16, 0>;
+def : ReadAdvance<ReadFMovI16ToF16, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass16, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+
+// Bitmanip
+def : ReadAdvance<ReadRotateImm, 0>;
+def : ReadAdvance<ReadRotateImm32, 0>;
+def : ReadAdvance<ReadRotateReg, 0>;
+def : ReadAdvance<ReadRotateReg32, 0>;
+def : ReadAdvance<ReadCLZ, 0>;
+def : ReadAdvance<ReadCLZ32, 0>;
+def : ReadAdvance<ReadCTZ, 0>;
+def : ReadAdvance<ReadCTZ32, 0>;
+def : ReadAdvance<ReadCPOP, 0>;
+def : ReadAdvance<ReadCPOP32, 0>;
+def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
+def : ReadAdvance<ReadREV8, 0>;
+def : ReadAdvance<ReadSHXADD, 0>;
+def : ReadAdvance<ReadSHXADD32, 0>;
+// Single-bit instructions
+def : ReadAdvance<ReadSingleBit, 0>;
+def : ReadAdvance<ReadSingleBitImm, 0>;
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedV;
+defm : UnsupportedSchedXsfvcp;
+defm : UnsupportedSchedZabha;
+defm : UnsupportedSchedZbc;
+defm : UnsupportedSchedZbkb;
+defm : UnsupportedSchedZbkx;
+defm : UnsupportedSchedZfa;
+defm : UnsupportedSchedZvk;
+defm : UnsupportedSchedSFB;
+}
diff --git a/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fp.s b/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fp.s
new file mode 100644
index 00000000000000..801db0edf83ef3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fp.s
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64-unknown-unknown -mcpu=tt-ascalon-d8 --iterations=1 < %s | FileCheck %s
+
+fmin.s ft0, fa0, fa1
+fmax.s ft1, fa0, fa1
+fmin.d ft2, ft4, ft5
+fmax.d ft3, ft4, ft5
+fmadd.s fs0, fs0, fs8, fs9
+fmsub.s fs1, fs1, fs8, fs9
+fmul.s fs3, fs3, fs4
+fdiv.s fs2, fs3, fs4
+fmul.d ft4, ft4, ft5
+fdiv.d fs4, fa3, ft5
+fsqrt.s ft1, fa2
+fsqrt.d ft2, fa3
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 12
+# CHECK-NEXT: Total Cycles: 31
+# CHECK-NEXT: Total uOps: 12
+
+# CHECK: Dispatch Width: 8
+# CHECK-NEXT: uOps Per Cycle: 0.39
+# CHECK-NEXT: IPC: 0.39
+# CHECK-NEXT: Block RThroughput: 23.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 3 0.50 fmin.s ft0, fa0, fa1
+# CHECK-NEXT: 1 3 0.50 fmax.s ft1, fa0, fa1
+# CHECK-NEXT: 1 3 0.50 fmin.d ft2, ft4, ft5
+# CHECK-NEXT: 1 3 0.50 fmax.d ft3, ft4, ft5
+# CHECK-NEXT: 1 3 0.50 fmadd.s fs0, fs0, fs8, fs9
+# CHECK-NEXT: 1 3 0.50 fmsub.s fs1, fs1, fs8, fs9
+# CHECK-NEXT: 1 3 0.50 fmul.s fs3, fs3, fs4
+# CHECK-NEXT: 1 7 3.50 fdiv.s fs2, fs3, fs4
+# CHECK-NEXT: 1 3 0.50 fmul.d ft4, ft4, ft5
+# CHECK-NEXT: 1 12 6.00 fdiv.d fs4, fa3, ft5
+# CHECK-NEXT: 1 7 3.50 fsqrt.s ft1, fa2
+# CHECK-NEXT: 1 12 6.00 fsqrt.d ft2, fa3
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - AscalonFP
+# CHECK-NEXT: [0.1] - AscalonFP
+# CHECK-NEXT: [1] - AscalonFXA
+# CHECK-NEXT: [2] - AscalonFXB
+# CHECK-NEXT: [3.0] - AscalonFXC
+# CHECK-NEXT: [3.1] - AscalonFXC
+# CHECK-NEXT: [4.0] - AscalonFXD
+# CHECK-NEXT: [4.1] - AscalonFXD
+# CHECK-NEXT: [5.0] - AscalonLS
+# CHECK-NEXT: [5.1] - AscalonLS
+# CHECK-NEXT: [5.2] - AscalonLS
+# CHECK-NEXT: [6.0] - AscalonV
+# CHECK-NEXT: [6.1] - AscalonV
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4.0] [4.1] [5.0] [5.1] [5.2] [6.0] [6.1]
+# CHECK-NEXT: 18.00 28.00 - - - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4.0] [4.1] [5.0] [5.1] [5.2] [6.0] [6.1] Instructions:
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - fmin.s ft0, fa0, fa1
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - fmax.s ft1, fa0, fa1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - fmin.d ft2, ft4, ft5
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - fmax.d ft3, ft4, ft5
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - fmadd.s fs0, fs0, fs8, fs9
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - fmsub.s fs1, fs1, fs8, fs9
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - fmul.s fs3, fs3, fs4
+# CHECK-NEXT: 7.00 - - - - - - - - - - - - fdiv.s fs2, fs3, fs4
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - fmul.d ft4, ft4, ft5
+# CHECK-NEXT: - 12.00 - - - - - - - - - - - fdiv.d fs4, fa3, ft5
+# CHECK-NEXT: 7.00 - - - - - - - - - - - - fsqrt.s ft1, fa2
+# CHECK-NEXT: - 12.00 - - - - - - - - - - - fsqrt.d ft2, fa3
diff --git a/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fx.s b/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fx.s
new file mode 100644
index 00000000000000..62827eb6628156
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/tt-ascalon-d8/fx.s
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64-unknown-unknown -mcpu=tt-ascalon-d8 --iterations=1 < %s | FileCheck %s
+
+mul t0, a0, t0
+sub s2, a2, a3
+div t1, t2, t3
+add t1, a4, x3
+div a1, a2, a3
+add t1, a0, t0
+mul s0, a5, s0
+add t2, t2, t2
+sub s1, s0, s1
+fcvt.s.w f1, t3
+add s2, s2, s2
+fcvt.w.s t5, f3
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 12
+# CHECK-NEXT: Total Cycles: 22
+# CHECK-NEXT: Total uOps: 12
+
+# CHECK: Dispatch Width: 8
+# CHECK-NEXT: uOps Per Cycle: 0.55
+# CHECK-NEXT: IPC: 0.55
+# CHECK-NEXT: Block RThroughput: 17.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 3 1.00 mul t0, a0, t0
+# CHECK-NEXT: 1 1 0.17 sub s2, a2, a3
+# CHECK-NEXT: 1 7 7.00 div t1, t2, t3
+# CHECK-NEXT: 1 1 0.17 add t1, a4, gp
+# CHECK-NEXT: 1 7 7.00 div a1, a2, a3
+# CHECK-NEXT: 1 1 0.17 add t1, a0, t0
+# CHECK-NEXT: 1 3 1.00 mul s0, s0, a5
+# CHECK-NEXT: 1 1 0.17 add t2, t2, t2
+# CHECK-NEXT: 1 1 0.17 sub s1, s0, s1
+# CHECK-NEXT: 1 1 1.00 fcvt.s.w ft1, t3
+# CHECK-NEXT: 1 1 0.17 add s2, s2, s2
+# CHECK-NEXT: 1 1 1.00 fcvt.w.s t5, ft3
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - AscalonFP
+# CHECK-NEXT: [0.1] - AscalonFP
+# CHECK-NEXT: [1] - AscalonFXA
+# CHECK-NEXT: [2] - AscalonFXB
+# CHECK-NEXT: [3.0] - AscalonFXC
+# CHECK-NEXT: [3.1] - AscalonFXC
+# CHECK-NEXT: [4.0] - AscalonFXD
+# CHECK-NEXT: [4.1] - AscalonFXD
+# CHECK-NEXT: [5.0] - AscalonLS
+# CHECK-NEXT: [5.1] - AscalonLS
+# CHECK-NEXT: [5.2] - AscalonLS
+# CHECK-NEXT: [6.0] - AscalonV
+# CHEC...
[truncated]
|
Changing processor resources changes llvm-mca output
Regarding the issues @mshockwave raised about the microarchitecture diagram on our website: I am trying to follow up on the backstory behind that. However, this PR is the definition we are using internally, and I am pretty confident it is accurate. |
Co-authored-by: Pengcheng Wang <[email protected]>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
I need help merging the change, thanks in advance! |
First part of tt-ascalon-d8 scheduling model, only containing scalar ops. Scheduling for vector instructions will be added in a follow-up patch.