[AArch64] Initial sched model for Neoverse N3 #106371

FLZ101 · 2024-08-28T11:18:23Z

References:

Arm Neoverse N3 Software Optimization Guide
Arm A64 Instruction Set for A-profile architecture

llvmbot · 2024-08-28T11:18:55Z

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-aarch64

Author: Franklin (FLZ101)

Changes

References:

Arm Neoverse N3 Software Optimization Guide
Arm A64 Instruction Set for A-profile architecture

Patch is 1.64 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106371.diff

14 Files Affected:

(modified) clang/test/Misc/target-invalid-cpu-note/arm.c (+1)
(modified) llvm/include/llvm/TargetParser/ARMTargetParser.def (+3)
(modified) llvm/lib/Target/AArch64/AArch64.td (+1)
(modified) llvm/lib/Target/AArch64/AArch64Processors.td (+1-1)
(added) llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td (+2359)
(modified) llvm/test/CodeGen/AArch64/cpus.ll (+1)
(added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s (+3725)
(added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s (+350)
(added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s (+3236)
(added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s (+10262)
(added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s (+5320)
(modified) llvm/unittests/Target/AArch64/AArch64SVESchedPseudoTest.cpp (+4)
(modified) llvm/unittests/TargetParser/Host.cpp (+3)
(modified) llvm/unittests/TargetParser/TargetParserTest.cpp (+1-1)

diff --git a/clang/test/Misc/target-invalid-cpu-note/arm.c b/clang/test/Misc/target-invalid-cpu-note/arm.c
index 27608cc6eb29fc..278cd76bdf170e 100644
--- a/clang/test/Misc/target-invalid-cpu-note/arm.c
+++ b/clang/test/Misc/target-invalid-cpu-note/arm.c
@@ -88,6 +88,7 @@
 // CHECK-SAME: {{^}}, cortex-x1c
 // CHECK-SAME: {{^}}, neoverse-n1
 // CHECK-SAME: {{^}}, neoverse-n2
+// CHECK-SAME: {{^}}, neoverse-n3
 // CHECK-SAME: {{^}}, neoverse-v1
 // CHECK-SAME: {{^}}, cyclone
 // CHECK-SAME: {{^}}, exynos-m3
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def
index e5a1ce54fd46a7..bf4ef09303d1e8 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.def
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def
@@ -380,6 +380,9 @@ ARM_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
 ARM_CPU_NAME("neoverse-n2", ARMV9A, FK_NEON_FP_ARMV8, false,
              (ARM::AEK_BF16 | ARM::AEK_DOTPROD | ARM::AEK_FP16FML |
               ARM::AEK_I8MM | ARM::AEK_RAS | ARM::AEK_SB ))
+ARM_CPU_NAME("neoverse-n3", ARMV9_2A, FK_NEON_FP_ARMV8, false,
+             (ARM::AEK_BF16 | ARM::AEK_DOTPROD | ARM::AEK_FP16FML |
+              ARM::AEK_I8MM | ARM::AEK_RAS | ARM::AEK_SB ))
 ARM_CPU_NAME("neoverse-v1", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false,
              (ARM::AEK_RAS | ARM::AEK_FP16 | ARM::AEK_BF16 | ARM::AEK_DOTPROD))
 ARM_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 251318fe4b5efd..9378081e675a85 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -112,6 +112,7 @@ include "AArch64SchedAmpere1.td"
 include "AArch64SchedAmpere1B.td"
 include "AArch64SchedNeoverseN1.td"
 include "AArch64SchedNeoverseN2.td"
+include "AArch64SchedNeoverseN3.td"
 include "AArch64SchedNeoverseV1.td"
 include "AArch64SchedNeoverseV2.td"
 include "AArch64SchedOryon.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 84d8cae3a0a5d1..8944eb88b4a4ff 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1127,7 +1127,7 @@ def : ProcessorModel<"neoverse-n1", NeoverseN1Model,
 def : ProcessorModel<"neoverse-n2", NeoverseN2Model,
                      ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>;
 def : ProcessorAlias<"cobalt-100", "neoverse-n2">;
-def : ProcessorModel<"neoverse-n3", NeoverseN2Model,
+def : ProcessorModel<"neoverse-n3", NeoverseN3Model,
                      ProcessorFeatures.NeoverseN3, [TuneNeoverseN3]>;
 def : ProcessorModel<"neoverse-512tvb", NeoverseV1Model,
                      ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
new file mode 100644
index 00000000000000..68568f6ec7ac78
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -0,0 +1,2359 @@
+//=- AArch64SchedNeoverseN3.td - NeoverseN3 Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse N3 processors.
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseN3Model : SchedMachineModel {
+    let IssueWidth            =  10; // Micro-ops dispatched at a time.
+    let MicroOpBufferSize     = 160; // Entries in micro-op re-order buffer. NOTE: Copied from N2.
+    let LoadLatency           =   4; // Optimistic load latency.
+    let MispredictPenalty     =  10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
+    let LoopMicroOpBufferSize =  16; // NOTE: Copied from Cortex-A57.
+    let CompleteModel         =   1;
+
+    list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+        [HasSVE2p1, HasPAuthLR, HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse N3.
+// Instructions are first fetched and then decoded into internal Macro-OPerations
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch stages.
+// A MOP can be split into two Micro-OPerations (µOPs) further down the pipeline
+// after the decode stage. Once dispatched, µOPs wait for their operands and issue
+// out-of-order to one of thirteen issue pipelines. Each issue pipeline can accept
+// one µOP per cycle.
+
+let SchedModel = NeoverseN3Model in {
+
+// Define the (13) issue ports.
+def N3UnitB   : ProcResource<2>;  // Branch 0/1
+def N3UnitS   : ProcResource<2>;  // Integer Single-Cycle 0/1
+def N3UnitM0  : ProcResource<1>;  // Integer Single/Multi-Cycle 0
+def N3UnitM1  : ProcResource<1>;  // Integer Single/Multi-Cycle 1
+def N3UnitV0  : ProcResource<1>;  // FP/ASIMD 0
+def N3UnitV1  : ProcResource<1>;  // FP/ASIMD 1
+def N3UnitD   : ProcResource<2>;  // Integer Store data 0/1
+def N3UnitL01 : ProcResource<2>;  // Load/Store 0/1
+def N3UnitL2  : ProcResource<1>;  // Load 2
+
+def N3UnitI : ProcResGroup<[N3UnitS, N3UnitM0, N3UnitM1]>;
+def N3UnitM : ProcResGroup<[N3UnitM0, N3UnitM1]>;
+def N3UnitL : ProcResGroup<[N3UnitL01, N3UnitL2]>;
+def N3UnitV : ProcResGroup<[N3UnitV0, N3UnitV1]>;
+
+//===----------------------------------------------------------------------===//
+
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST,      0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv,    []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint,    []> { let Unsupported = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse N3.
+
+//===----------------------------------------------------------------------===//
+// Define generic 0 micro-op types
+
+def N3Write_0c : SchedWriteRes<[]> {
+    let Latency = 0;
+    let NumMicroOps = 0;
+}
+
+def N3Write_4c : SchedWriteRes<[]> {
+    let Latency = 4;
+    let NumMicroOps = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 1 micro-op types
+
+def N3Write_1c_1B   : SchedWriteRes<[N3UnitB]>   { let Latency = 1; }
+def N3Write_1c_1I   : SchedWriteRes<[N3UnitI]>   { let Latency = 1; }
+def N3Write_2c_1M   : SchedWriteRes<[N3UnitM]>   { let Latency = 2; }
+def N3Write_2c_1M0  : SchedWriteRes<[N3UnitM0]>  { let Latency = 2; }
+def N3Write_3c_1M   : SchedWriteRes<[N3UnitM]>   { let Latency = 3; }
+def N3Write_1c_1M   : SchedWriteRes<[N3UnitM]>   { let Latency = 1; }
+def N3Write_4c_1M   : SchedWriteRes<[N3UnitM]>   { let Latency = 4; }
+def N3Write_1c_1S   : SchedWriteRes<[N3UnitS]>   { let Latency = 1; }
+def N3Write_4c_1L   : SchedWriteRes<[N3UnitL]>   { let Latency = 4; }
+def N3Write_2c_1V   : SchedWriteRes<[N3UnitV]>   { let Latency = 2; }
+def N3Write_5c_1V0  : SchedWriteRes<[N3UnitV0]>  { let Latency = 5; }
+def N3Write_7c_1V0  : SchedWriteRes<[N3UnitV0]>  { let Latency = 7; }
+def N3Write_12c_1V0 : SchedWriteRes<[N3UnitV0]>  { let Latency = 12; }
+def N3Write_3c_1V   : SchedWriteRes<[N3UnitV]>   { let Latency = 3; }
+def N3Write_4c_1V   : SchedWriteRes<[N3UnitV]>   { let Latency = 4; }
+def N3Write_3c_1V0  : SchedWriteRes<[N3UnitV0]>  { let Latency = 3; }
+def N3Write_3c_1M0  : SchedWriteRes<[N3UnitM0]>  { let Latency = 3; }
+def N3Write_6c_1L   : SchedWriteRes<[N3UnitL]>   { let Latency = 6; }
+def N3Write_4c_1V1  : SchedWriteRes<[N3UnitV1]>  { let Latency = 4; }
+def N3Write_3c_1V1  : SchedWriteRes<[N3UnitV1]>  { let Latency = 3; }
+def N3Write_4c_1V0  : SchedWriteRes<[N3UnitV0]>  { let Latency = 4; }
+def N3Write_2c_1V0  : SchedWriteRes<[N3UnitV0]>  { let Latency = 2; }
+def N3Write_2c_1V1  : SchedWriteRes<[N3UnitV1]>  { let Latency = 2; }
+def N3Write_5c_1V   : SchedWriteRes<[N3UnitV]>   { let Latency = 5; }
+def N3Write_1c_1L01 : SchedWriteRes<[N3UnitL01]> { let Latency = 1; }
+
+def N3Write_12c_1M0 : SchedWriteRes<[N3UnitM0]> {
+    let Latency = 12;
+    let ReleaseAtCycles = [12];
+}
+
+def N3Write_20c_1M0 : SchedWriteRes<[N3UnitM0]> {
+    let Latency = 20;
+    let ReleaseAtCycles = [20];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def N3Write_1c_2I : SchedWriteRes<[N3UnitI]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ReleaseAtCycles = [2];
+}
+
+def N3Write_1c_1B_1S : SchedWriteRes<[N3UnitB, N3UnitS]> {
+    let Latency = 1;
+    let NumMicroOps = 2;
+}
+
+def N3Write_2c_1M_1B : SchedWriteRes<[N3UnitM, N3UnitB]> {
+    let Latency = 2;
+    let NumMicroOps = 2;
+}
+
+def N3Write_5c_1L_1S : SchedWriteRes<[N3UnitL, N3UnitS]> {
+    let Latency = 5;
+    let NumMicroOps = 2;
+}
+
+def N3Write_4c_2L : SchedWriteRes<[N3UnitL]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_3c_1L01_1V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+}
+
+def N3Write_1c_1L01_1D : SchedWriteRes<[N3UnitL01, N3UnitD]> {
+    let Latency = 1;
+    let NumMicroOps = 2;
+}
+
+def N3Write_5c_1L_1I : SchedWriteRes<[N3UnitL, N3UnitI]> {
+    let Latency = 5;
+    let NumMicroOps = 2;
+}
+
+def N3Write_6c_2L : SchedWriteRes<[N3UnitL]> {
+    let Latency = 6;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_2c_1L01_1V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 2;
+    let NumMicroOps = 2;
+}
+
+def N3Write_6c_2V1 : SchedWriteRes<[N3UnitV1]> {
+    let Latency = 6;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_4c_2V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_8c_2V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 8;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_13c_2V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 13;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_4c_2V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_2c_2V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 2;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_8c_1L_1V : SchedWriteRes<[N3UnitL, N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 2;
+}
+
+def N3Write_2c_1V_1L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 2;
+    let NumMicroOps = 2;
+}
+
+def N3Write_5c_2V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 5;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_6c_2V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 6;
+    let NumMicroOps = 2;
+    let ReleaseAtCycles = [2];
+}
+
+def N3Write_7c_1L_1M : SchedWriteRes<[N3UnitL, N3UnitM]> {
+    let Latency = 7;
+    let NumMicroOps = 2;
+}
+
+def N3Write_8c_1V_1L : SchedWriteRes<[N3UnitV, N3UnitL]> {
+    let Latency = 8;
+    let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def N3Write_5c_1M0_2V : SchedWriteRes<[N3UnitM0, N3UnitV]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [1, 2];
+}
+
+def N3Write_5c_1V1_2V : SchedWriteRes<[N3UnitV1, N3UnitV]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [1, 2];
+}
+
+def N3Write_6c_3V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 6;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [3];
+}
+
+def N3Write_4c_3V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 4;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [3];
+}
+
+def N3Write_6c_3L : SchedWriteRes<[N3UnitL]> {
+    let Latency = 6;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [3];
+}
+
+def N3Write_8c_2L_1V : SchedWriteRes<[N3UnitL, N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [2, 1];
+}
+
+def N3Write_8c_1M0_2V : SchedWriteRes<[N3UnitM0, N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [1, 2];
+}
+
+def N3Write_7c_2V_1V1 : SchedWriteRes<[N3UnitV, N3UnitV1]> {
+    let Latency = 7;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [2, 1];
+}
+
+def N3Write_5c_2V_1V1 : SchedWriteRes<[N3UnitV, N3UnitV1]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ReleaseAtCycles = [2, 1];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def N3Write_5c_1M_1L_2I : SchedWriteRes<[N3UnitM, N3UnitL, N3UnitI]> {
+    let Latency = 5;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [1, 1, 2];
+}
+
+def N3Write_4c_2I_2L : SchedWriteRes<[N3UnitI, N3UnitL]> {
+    let Latency = 4;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [2, 2];
+}
+
+def N3Write_1c_1L01_1D_2I : SchedWriteRes<[N3UnitL01, N3UnitD, N3UnitI]> {
+    let Latency = 1;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [1, 1, 2];
+}
+
+def N3Write_2c_2I_1L01_1V : SchedWriteRes<[N3UnitI, N3UnitL01, N3UnitV]> {
+    let Latency = 2;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [2, 1, 1];
+}
+
+def N3Write_6c_4V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 6;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_8c_4V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 8;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_10c_4V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 10;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_6c_4V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 6;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_7c_4L : SchedWriteRes<[N3UnitL]> {
+    let Latency = 7;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_2c_2L01_2V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 2;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [2, 2];
+}
+
+def N3Write_4c_2V_2L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 4;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [2, 2];
+}
+
+def N3Write_2c_2V_2L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 2;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [2, 2];
+}
+
+def N3Write_8c_4V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [4];
+}
+
+def N3Write_2c_1L01_2I_1V : SchedWriteRes<[N3UnitL01, N3UnitI, N3UnitV]> {
+    let Latency = 2;
+    let NumMicroOps = 4;
+    let ReleaseAtCycles = [1, 2, 1];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def N3Write_7c_2M_1M0_2V : SchedWriteRes<[N3UnitM, N3UnitM0, N3UnitV]> {
+    let Latency = 7;
+    let NumMicroOps = 5;
+    let ReleaseAtCycles = [2, 1, 2];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def N3Write_4c_3V_3L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 4;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [3, 3];
+}
+
+def N3Write_2c_3V_3L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 2;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [3, 3];
+}
+
+def N3Write_8c_4V_2V1 : SchedWriteRes<[N3UnitV, N3UnitV1]> {
+    let Latency = 8;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [4, 2];
+}
+
+def N3Write_4c_3L01_3V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 4;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [3, 3];
+}
+
+def N3Write_3c_3L01_3V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 3;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [3, 3];
+}
+
+def N3Write_6c_3L01_3V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 6;
+    let NumMicroOps = 6;
+    let ReleaseAtCycles = [3, 3];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def N3Write_8c_4L_3V : SchedWriteRes<[N3UnitL, N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 7;
+    let ReleaseAtCycles = [4, 3];
+}
+
+def N3Write_10c_4L_3V : SchedWriteRes<[N3UnitL, N3UnitV]> {
+    let Latency = 10;
+    let NumMicroOps = 7;
+    let ReleaseAtCycles = [4, 3];
+}
+
+def N3Write_8c_3V_4L : SchedWriteRes<[N3UnitV, N3UnitL]> {
+    let Latency = 8;
+    let NumMicroOps = 7;
+    let ReleaseAtCycles = [3, 4];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def N3Write_12c_8V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 12;
+    let NumMicroOps = 8;
+    let ReleaseAtCycles = [8];
+}
+
+def N3Write_4c_4V_4L01 : SchedWriteRes<[N3UnitV, N3UnitL01]> {
+    let Latency = 4;
+    let NumMicroOps = 8;
+    let ReleaseAtCycles = [4, 4];
+}
+
+def N3Write_8c_8V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 8;
+    let NumMicroOps = 8;
+    let ReleaseAtCycles = [8];
+}
+
+def N3Write_16c_8V : SchedWriteRes<[N3UnitV]> {
+    let Latency = 16;
+    let NumMicroOps = 8;
+    let ReleaseAtCycles = [8];
+}
+
+def N3Write_3c_4L01_4V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
+    let Latency = 3;
+    let NumMicroOps = 8;
+    let ReleaseAtCycles = [4, 4];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def N3Write_8c_6L_4V : SchedWriteRes<[N3UnitL, N3UnitV]> {
+    let Latency = 8;
+    let NumMicroOps = 10;
+    let ReleaseAtCycles = [6, 4];
+}
+
+def N3Write_8c_4V_6L : SchedWriteRes<[N3UnitV, N3UnitL]> {
+    let Latency = 8;
+    let NumMicroOps = 10;
+    let ReleaseAtCycles = [4, 6];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def N3Write_12c_5V_7L : SchedWriteRes<[N3UnitV, N3UnitL]> {
+    let Latency = 12;
+    let NumMicroOps = 12;
+    let ReleaseAtCycles = [5, 7];
+}
+
+def N3Write_4c_3L01_6I_3V : SchedWriteRes<[N3UnitL01, N3UnitI, N3UnitV]> {
+    let Latency = 4;
+    let NumMicroOps = 12;
+    let ReleaseAtCycles = [3, 6, 3];
+}
+
+def N3Write_3c_3L01_6I_3V : SchedWriteRes<[N3UnitL01, N3UnitI, N3UnitV]> {
+    let Latency = 3;
+    let NumMicroOps = 12;
+    let ReleaseAtCycles = [3, 6, 3];
+}
+
+def N3Write_6c_3L01_6I_3V : SchedWriteRes<[N3UnitL01, N3UnitI, N3UnitV]> {
+    let Latency = 6;
+    let NumMicroOps = 12;
+    let ReleaseAtCycles = [3, 6, 3];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 13 micro-op types
+
+def N3Write_9c_3V_4L_6I : SchedWriteRes<[N3UnitV, N3UnitL, N3UnitI]> {
+    let Latency = 9;
+    let NumMicroOps = 13;
+    let ReleaseAtCycles = [3, 4, 6];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 15 micro-op types
+
+def N3Write_10c_6V_9L : SchedWriteRes<[N3UnitV, N3UnitL]> {
+    let Latency = 10;
+    let NumMicroOps = 15;
+    let ReleaseAtCycles = [6, 9];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0]> {
+    let Latency = 16;
+    let NumMicroOps = 16;
+    let ReleaseAtCycles = [16];
+}
+
+def N3Write_3c_4L01_8I_4V : SchedWriteRes<[N3UnitL01, N3UnitI, N3UnitV]> {
+    let Latency = 3;
+    let NumMicroOps = 16;
+    let ReleaseAtCycles = [4, 8, 4];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def N3Write_9c_6L_4V_8I : SchedWriteRes<[N3Uni...
[truncated]

davemgreen · 2024-08-28T12:25:24Z

llvm/include/llvm/TargetParser/ARMTargetParser.def

@@ -380,6 +380,9 @@ ARM_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
 ARM_CPU_NAME("neoverse-n2", ARMV9A, FK_NEON_FP_ARMV8, false,
             (ARM::AEK_BF16 | ARM::AEK_DOTPROD | ARM::AEK_FP16FML |
              ARM::AEK_I8MM | ARM::AEK_RAS | ARM::AEK_SB ))
+ARM_CPU_NAME("neoverse-n3", ARMV9_2A, FK_NEON_FP_ARMV8, false,


This looks like it is for the "Arm" backend, as opposed to AArch64. It shouldn't be needed as far as I understand.

Thanks. The changes are undone.

FLZ101 · 2024-08-29T11:30:35Z

@c-rhodes @sjoerdmeijer @rj-jesus

davemgreen

I didn't look at everything but most of what I looked at matches what I would have expected. I just had some questions/comments.

davemgreen · 2024-08-29T13:22:02Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : ReadAdvance<ReadVLD,     0>;
+
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv,    []> { let Unsupported = 1; }


Any reason why these are Unsupported?

Because they are not used by the sched model. Is it better to set them even if they are not used?

I think I would copy what other models do. If new instructions are added with these Writes (which, granted, might be unlikely for some), then the scheduling info would get some sensible default for the new instructions, even if the model doesn't know about them. (Although I can see your point about them being unused.)

davemgreen · 2024-08-29T13:32:14Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+  let NumMicroOps = 2;
+  let ReleaseAtCycles = [2];


This looks like it is used for the increment on some post/preinc. Why is it 2 MicroOps? I think I would expect them to be WriteAdr, and WriteAdr to be N3Write_1c_1I.

Sure. I'll remove this write and use WriteAdr instead.

davemgreen · 2024-08-29T13:45:33Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[N3Write_2c_1L01_1V], (instregex "^STUR[BHSD]i$")>;
+
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[N3Write_2c_1L01_1V], (instrs STURQi)>;


Combine into STUR above?

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

davemgreen · 2024-08-29T14:11:03Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+                                         "^UMOVvi(8|16|32|64)$")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[N3Write_5c_1M0_2V], (instregex "^INSvi(8|16|32|64)gpr$")>;


Why 2V? Same for other places.

From "Neoverse N3 Software Optimization Guide":

Instruction Group AArch64 Instructions Exec Latency Execution Throughput Utilized Pipelines

ASIMD transfer, gen reg to element INS 5 1 M0, V

V has two ports V0, V1 and so can execute two uops per cycle.

"Execution Throughput" is 1, so one instruction can be completed per cycle. And there are at most two uops for V decoded from each instruction.

Here "2V" means two uops for V.

I think the bottleneck on throughput comes from elsewhere, maybe the M0 pipeline? The optimization guide does not contain all the details, but I believe the core should be able, for example, to execute an INS and another V operation in the same cycle.

Actually I used a script to generate a draft of the sched model based on a simple rule: matching the throughput assuming all utilized ports are fully utilized without considering any microarchitecture details.

Are those details described in any documents?

Done. With V2 as a reference, those '2V' entries have been replaced with '1V'.

davemgreen · 2024-08-29T14:16:09Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+
+// Arithmetic, shift by immediate
+// Arithmetic, shift by immediate and insert
+// def : InstRW<[N3Write_2c_1V1], (instrs SHRNB, SHRNT, SSHLLB, SSHLLT, USHLLB, USHLLT)>;


Commented code.

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

llvm/lib/Target/AArch64/AArch64Processors.td

llvm/unittests/TargetParser/Host.cpp

davemgreen · 2024-09-02T07:56:45Z

Hello. The other thing that came up from talking to internal colleagues was whether we have any evidence that this scheduling model is working well and reflecting the real hardware. Have you run any performance data or even have you looked into calibrating against the latencies/throughputs produced from hardware?

FLZ101 · 2024-09-02T08:05:41Z

Hello. The other thing that came up from talking to internal colleagues was whether we have any evidence that this scheduling model is working well and reflecting the real hardware. Have you run any performance data or even have you looked into calibrating against the latencies/throughputs produced from hardware?

I have no N3 machine. I just "translate" the N3 SWOG into a sched model.

davemgreen · 2024-09-04T08:31:02Z

I have no N3 machine. I just "translate" the N3 SWOG into a sched model.

OK. I will see if I can try and run something to give it a quick check.

One extra thing, this definition says "take one N3UnitL and hold it for 6 cycles and one N3UnitV is held for 4 cycles".

def N3Write_8c_6L_4V : SchedWriteRes<[N3UnitL, N3UnitV]> {
    let Latency = 8;
    let NumMicroOps = 10;
    let ReleaseAtCycles = [6, 4];
}

If the operation is split down into micro-operations then they should be separate uops that can be executed independently. So this version for example says "there are 4 L ops and 8 V ops that can execute in any unit as needed".

def V2Write_9cyc_4L_8V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
                                        V2UnitL, V2UnitV, V2UnitV,
                                        V2UnitV, V2UnitV, V2UnitV,
                                        V2UnitV, V2UnitV, V2UnitV]> {
  let Latency     = 9;
  let NumMicroOps = 12;
}

The dependency between the L and V ops is not well described in tablegen scheduling models, but it should lead to better analysis in general. Things like divide (and in older models sqrt) generally use ReleaseAtCycles; other operations should ideally split them out.

FLZ101 · 2024-09-05T02:01:16Z

OK. I will see if I can try and run something to give it a quick check.

Thanks. I will have more time to work on this PR this weekend.

FLZ101 · 2024-09-07T12:05:18Z

Things like divide (and in older models sqrt) generally use ReleaseAtCycles; other operations should ideally split them out.

Thanks. Now I only keep ReleaseAtCycles for "Divide, [WX]-form".

In the N3 SWOG, only integer divides are performed using an iterative algorithm and operations like "FP divide and square root", "ASIMD FP divide and square root", etc are now performed using a fully pipelined data path.

davemgreen

Thanks. Now I only keep ReleaseAtCycles for "Divide, [WX]-form".

In the N3 SWOG, only integer divides are performed using an iterative algorithm and operations like "FP divide and square root", "ASIMD FP divide and square root", etc are now performed using a fully pipelined data path.

Yeah that is nice. Thanks for the updates, I managed to run some testing and it looked OK. It took a little while to set up but the performance, overall, was a very slight improvement (but that might be in the realms of noise). It helps validate that nothing is going too wrong with the new numbers.

I have some suggestions mostly about the number of L pipes used in certain operations. I don't think I would worry too much about hitting the throughput numbers in the SWOG exactly so long as they are close. There can be other things other than number of micro-ops that can cause bottlenecks that are not modelled as well in the scheduler. It probably makes more sense to try and keep the micro-ops sensible.

If you can make these adjustments then this LGTM. Thanks for working on this and making the changes.

davemgreen · 2024-09-12T08:50:22Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[WriteAdr, N3Write_6c_2L], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[N3Write_8c_4L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;


I think it only makes sense for an LD3 to load 3 things. I would change this to N3Write_8c_3L_3V, and if the throughputs are a little off from the guide that is OK.

(The actual thing making them slower could be elsewhere like the dispatch constraints, which are not modelled as well).

done.

N3Write_8c_3L_3V still matches the throughput.

davemgreen · 2024-09-12T08:52:11Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[WriteAdr, N3Write_6c_3L], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[N3Write_8c_6L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;


4L for these LD4's.

davemgreen · 2024-09-12T09:03:03Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+                                         "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[N3Write_6c_3L], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;


I would expect 2L here.

davemgreen · 2024-09-12T09:04:35Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[N3Write_8c_1V_1L], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[N3Write_8c_3V_4L], (instregex "^LD3D_IMM$")>;


davemgreen · 2024-09-12T09:05:04Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[N3Write_8c_3V_4L], (instregex "^LD3D_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[N3Write_10c_6V_9L], (instregex "^LD3[BHW]_IMM$")>;


9L sounds like a lot. I would expect 3L most of the time.

davemgreen · 2024-09-12T09:06:44Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+def : InstRW<[N3Write_11c_6V_9L_12I], (instregex "^LD3[BHW]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[N3Write_8c_4V_6L], (instregex "^LD4D_IMM$")>;


4L for the LD4's.

davemgreen · 2024-09-12T09:11:47Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+                                         "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[N3Write_6c_3L], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",


I would expect a 2-element gather (v2i64) to use 2 L ops, but maybe some extra work is needed to combine them back together.

FLZ101 · 2024-09-12T17:29:26Z

I am checking the others with N2SchedModel and V2SchedModel as references. This may take some time.

References: * Arm Neoverse N3 Software Optimization Guide * Arm A64 Instruction Set for A-profile architecture

* use ReleaseAtCycles only for "Divide, [WX]-form"

* When Ra is ZR (multiply without accumulate), the MUL, MNEG, SMULL, SMNEGL, UMULL and UMNEGL instructions can be executed on pipeline M with an execution throughput of 2.

FLZ101 · 2024-09-17T11:15:51Z

I am checking the others with N2SchedModel and V2SchedModel as references. This may take some time.

Done. Besides changing some Ls, I also adjust some Is and perform a rebase.

* adjust some Ls and Is

davemgreen · 2024-09-18T15:58:27Z

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

+    let NumMicroOps = 9;
+}
+
+def N3Write_4c_3L01_3I_3V : SchedWriteRes<[N3UnitL01, N3UnitL01, N3UnitL01,


I think they should all be 1I, as in there is 1 additional integer operation to calculate the address.

I am not sure which one is better.

AArch64SchedNeoverseN2.td:

// Contiguous store three structures from three vectors, scalar + scalar
def : InstRW<[N2Write_7c_5L01_5S_5V], (instrs ST3H)>;

AArch64SchedNeoverseV2.td:

// Contiguous store three structures from three vectors, scalar + scalar
def : InstRW<[V2Write_7c_9L01_9S_9V01], (instregex "^ST3[BHWD]$")>;

It seems that the number of integer uops is the same as the number of vector uops. So I set the number of I to 3.

Oh I see. They might be wrong too, but I'm just guessing from what seems plausible and we don't have any official documentation one way or the other. Let's keep it like this then for the moment, and if we need to change it we can change them all.

Are you happy for me to hit submit?

Sure. Thanks. I have learned a lot.

davemgreen

If you can make that last change (or explain why it isn't needed), then this LGTM and we can get it in. Thanks

davemgreen · 2024-09-19T18:22:16Z

Thanks for the patch.

References: * Arm Neoverse N3 Software Optimization Guide * Arm A64 Instruction Set for A-profile architecture

llvmbot added the clang (Clang issues not falling into any other category) and backend:AArch64 labels Aug 28, 2024

davemgreen reviewed Aug 28, 2024

View reviewed changes

FLZ101 force-pushed the feature-n3 branch from 59b4684 to 687ca87 Compare August 28, 2024 14:02

davemgreen reviewed Aug 29, 2024

View reviewed changes

davemgreen requested review from c-rhodes, rj-jesus and sjoerdmeijer August 30, 2024 11:10

FLZ101 force-pushed the feature-n3 branch from 687ca87 to 20a0bc9 Compare August 30, 2024 16:59

davemgreen approved these changes Sep 12, 2024

View reviewed changes

FLZ101 added 5 commits September 17, 2024 18:09

[AArch64] Initial sched model for Neoverse N3

4e73fa9

References: * Arm Neoverse N3 Software Optimization Guide * Arm A64 Instruction Set for A-profile architecture

[AArch64] Initial sched model for Neoverse N3

6c9c180

[AArch64] Initial sched model for Neoverse N3

9e123ad

* use ReleaseAtCycles only for "Divide, [WX]-form"

[AArch64] Initial sched model for Neoverse N3

24bbf27

* Multiply without accumulate when Ra is ZR, MUL, MNEG, SMULL, SMNEGL, UMULL and UMNEGL instructions can be executed on utilized pipeline M with an execution throughput of 2.

[AArch64] Initial sched model for Neoverse N3

8e35762

FLZ101 force-pushed the feature-n3 branch from 24f4cfb to f984c41 Compare September 17, 2024 11:10

[AArch64] Initial sched model for Neoverse N3

9551cc2

* adjust some Ls and Is

FLZ101 force-pushed the feature-n3 branch from f984c41 to 9551cc2 Compare September 17, 2024 11:17

davemgreen reviewed Sep 18, 2024

View reviewed changes

davemgreen approved these changes Sep 18, 2024

View reviewed changes

davemgreen merged commit e45f9aa into llvm:main Sep 19, 2024
8 checks passed

tmsri pushed a commit to tmsri/llvm-project that referenced this pull request Sep 19, 2024

[AArch64] Initial sched model for Neoverse N3 (llvm#106371)

d354147

References: * Arm Neoverse N3 Software Optimization Guide * Arm A64 Instruction Set for A-profile architecture

[AArch64] Initial sched model for Neoverse N3 #106371

[AArch64] Initial sched model for Neoverse N3 #106371

Uh oh!

Conversation

FLZ101 commented Aug 28, 2024

Uh oh!

llvmbot commented Aug 28, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

FLZ101 commented Aug 29, 2024

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

FLZ101 Aug 30, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

FLZ101 Aug 30, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

davemgreen commented Sep 2, 2024

Uh oh!

FLZ101 commented Sep 2, 2024

Uh oh!

davemgreen commented Sep 4, 2024

Uh oh!

FLZ101 commented Sep 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

FLZ101 commented Sep 7, 2024

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

llvmbot commented Aug 28, 2024 •

edited

Loading

FLZ101 Aug 30, 2024 •

edited

Loading

FLZ101 Aug 30, 2024 •

edited

Loading

FLZ101 commented Sep 5, 2024 •

edited

Loading

FLZ101 Sep 19, 2024 •

edited

Loading