Skip to content

Commit 847e46c

Browse files
authored
[AArch64] Add initial support for -mcpu=olympus. (#132368)
This patch adds support for the NVIDIA Olympus core. It does not add any special tuning decisions; those may come later.
1 parent 9b060d1 commit 847e46c

File tree

9 files changed

+141
-1
lines changed

9 files changed

+141
-1
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %clang --target=aarch64 -mcpu=olympus -### -c %s 2>&1 | FileCheck -check-prefix=olympus %s
2+
// RUN: %clang --target=aarch64 -mlittle-endian -mcpu=olympus -### -c %s 2>&1 | FileCheck -check-prefix=olympus %s
3+
// RUN: %clang --target=aarch64 -mtune=olympus -### -c %s 2>&1 | FileCheck -check-prefix=olympus-TUNE %s
4+
// RUN: %clang --target=aarch64 -mlittle-endian -mtune=olympus -### -c %s 2>&1 | FileCheck -check-prefix=olympus-TUNE %s
5+
// olympus: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "olympus"
6+
// olympus-TUNE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic"
7+
8+
// RUN: %clang --target=arm64 -mcpu=olympus -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-olympus %s
9+
// RUN: %clang --target=arm64 -mlittle-endian -mcpu=olympus -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-olympus %s
10+
// RUN: %clang --target=arm64 -mtune=olympus -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-olympus-TUNE %s
11+
// RUN: %clang --target=arm64 -mlittle-endian -mtune=olympus -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-olympus-TUNE %s
12+
// ARM64-olympus: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "olympus"
13+
// ARM64-olympus-TUNE: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic"
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// REQUIRES: aarch64-registered-target
2+
// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=olympus | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s
3+
4+
// CHECK: Extensions enabled for the given AArch64 target
5+
// CHECK-EMPTY:
6+
// CHECK-NEXT: Architecture Feature(s) Description
7+
// CHECK-NEXT: FEAT_AES, FEAT_PMULL Enable AES support
8+
// CHECK-NEXT: FEAT_AMUv1 Enable Armv8.4-A Activity Monitors extension
9+
// CHECK-NEXT: FEAT_AMUv1p1 Enable Armv8.6-A Activity Monitors Virtualization support
10+
// CHECK-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions
11+
// CHECK-NEXT: FEAT_BF16 Enable BFloat16 Extension
12+
// CHECK-NEXT: FEAT_BRBE Enable Branch Record Buffer Extension
13+
// CHECK-NEXT: FEAT_BTI Enable Branch Target Identification
14+
// CHECK-NEXT: FEAT_CCIDX Enable Armv8.3-A Extend of the CCSIDR number of sets
15+
// CHECK-NEXT: FEAT_CHK Enable Armv8.0-A Check Feature Status Extension
16+
// CHECK-NEXT: FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions
17+
// CHECK-NEXT: FEAT_CSV2_2 Enable architectural speculation restriction
18+
// CHECK-NEXT: FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions
19+
// CHECK-NEXT: FEAT_DPB Enable Armv8.2-A data Cache Clean to Point of Persistence
20+
// CHECK-NEXT: FEAT_DPB2 Enable Armv8.5-A Cache Clean to Point of Deep Persistence
21+
// CHECK-NEXT: FEAT_DotProd Enable dot product support
22+
// CHECK-NEXT: FEAT_ECV Enable enhanced counter virtualization extension
23+
// CHECK-NEXT: FEAT_ETE Enable Embedded Trace Extension
24+
// CHECK-NEXT: FEAT_FAMINMAX Enable FAMIN and FAMAX instructions
25+
// CHECK-NEXT: FEAT_FCMA Enable Armv8.3-A Floating-point complex number support
26+
// CHECK-NEXT: FEAT_FGT Enable fine grained virtualization traps extension
27+
// CHECK-NEXT: FEAT_FHM Enable FP16 FML instructions
28+
// CHECK-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions
29+
// CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing
30+
// CHECK-NEXT: FEAT_FP8 Enable FP8 instructions
31+
// CHECK-NEXT: FEAT_FP8DOT2 Enable FP8 2-way dot instructions
32+
// CHECK-NEXT: FEAT_FP8DOT4 Enable FP8 4-way dot instructions
33+
// CHECK-NEXT: FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions
34+
// CHECK-NEXT: FEAT_FPAC Enable Armv8.3-A Pointer Authentication Faulting enhancement
35+
// CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int
36+
// CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions
37+
// CHECK-NEXT: FEAT_FlagM2 Enable alternative NZCV format for floating point comparisons
38+
// CHECK-NEXT: FEAT_HCX Enable Armv8.7-A HCRX_EL2 system register
39+
// CHECK-NEXT: FEAT_I8MM Enable Matrix Multiply Int8 Extension
40+
// CHECK-NEXT: FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions
41+
// CHECK-NEXT: FEAT_LOR Enable Armv8.1-A Limited Ordering Regions extension
42+
// CHECK-NEXT: FEAT_LRCPC Enable support for RCPC extension
43+
// CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC instructions with Immediate Offsets
44+
// CHECK-NEXT: FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA Enable Armv8.7-A LD64B/ST64B Accelerator Extension
45+
// CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions
46+
// CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules
47+
// CHECK-NEXT: FEAT_LUT Enable Lookup Table instructions
48+
// CHECK-NEXT: FEAT_MEC Enable Memory Encryption Contexts Extension
49+
// CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension
50+
// CHECK-NEXT: FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension
51+
// CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement
52+
// CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension
53+
// CHECK-NEXT: FEAT_PAN2 Enable Armv8.2-A PAN s1e1R and s1e1W Variants
54+
// CHECK-NEXT: FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension
55+
// CHECK-NEXT: FEAT_PMUv3 Enable Armv8.0-A PMUv3 Performance Monitors extension
56+
// CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions
57+
// CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions
58+
// CHECK-NEXT: FEAT_RME Enable Realm Management Extension
59+
// CHECK-NEXT: FEAT_RNG Enable Random Number generation instructions
60+
// CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier
61+
// CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension
62+
// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support
63+
// CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support
64+
// CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support
65+
// CHECK-NEXT: FEAT_SPE Enable Statistical Profiling extension
66+
// CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions
67+
// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension
68+
// CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit
69+
// CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions
70+
// CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions
71+
// CHECK-NEXT: FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions
72+
// CHECK-NEXT: FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions
73+
// CHECK-NEXT: FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions
74+
// CHECK-NEXT: FEAT_SVE_SM4 Enable SM4 SVE2 instructions
75+
// CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions
76+
// CHECK-NEXT: FEAT_TRBE Enable Trace Buffer Extension
77+
// CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension
78+
// CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState
79+
// CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension
80+
// CHECK-NEXT: FEAT_WFxT Enable Armv8.7-A WFET and WFIT instruction
81+
// CHECK-NEXT: FEAT_XS Enable Armv8.7-A limited-TLB-maintenance instruction

clang/test/Misc/target-invalid-cpu-note/aarch64.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
// CHECK-SAME: {{^}}, neoverse-v2
8787
// CHECK-SAME: {{^}}, neoverse-v3
8888
// CHECK-SAME: {{^}}, neoverse-v3ae
89+
// CHECK-SAME: {{^}}, olympus
8990
// CHECK-SAME: {{^}}, oryon-1
9091
// CHECK-SAME: {{^}}, saphira
9192
// CHECK-SAME: {{^}}, thunderx

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,17 @@ def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
284284
def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
285285
"Nvidia Carmel processors">;
286286

287+
def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
288+
"NVIDIA Olympus processors", [
289+
FeatureALULSLFast,
290+
FeatureCmpBccFusion,
291+
FeatureEnableSelectOptimize,
292+
FeatureFuseAES,
293+
FeatureFuseAdrpAdd,
294+
FeaturePostRAScheduler,
295+
FeaturePredictableSelectIsExpensive,
296+
FeatureUseFixedOverScalableIfEqualCost]>;
297+
287298
// Note that cyclone does not fuse AES instructions, but newer apple chips do
288299
// perform the fusion and cyclone is used by default when targeting apple OSes.
289300
def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
@@ -872,6 +883,15 @@ def ProcessorFeatures {
872883
list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES,
873884
FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM,
874885
FeatureFPARMv8];
886+
list<SubtargetFeature> Olympus = [HasV9_2aOps, FeatureBRBE, FeatureCCIDX,
887+
FeatureCHK, FeatureETE, FeatureFAMINMAX,
888+
FeatureFP16FML, FeatureFP8DOT2,
889+
FeatureFP8DOT4, FeatureFP8FMA, FeatureFPAC,
890+
FeatureLS64, FeatureLUT, FeatureMEC,
891+
FeatureMTE, FeaturePerfMon, FeatureRandGen,
892+
FeatureSPE, FeatureSPE_EEF, FeatureSSBS,
893+
FeatureSVEBitPerm, FeatureSVE2SHA3,
894+
FeatureSVE2SM4, FeatureSVEAES];
875895
list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
876896
FeatureNEON,FeaturePerfMon];
877897
list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
@@ -1266,6 +1286,10 @@ def : ProcessorModel<"fujitsu-monaka", A64FXModel, ProcessorFeatures.MONAKA,
12661286
def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel,
12671287
[TuneCarmel]>;
12681288

1289+
// NVIDIA Olympus
1290+
def : ProcessorModel<"olympus", NeoverseV2Model, ProcessorFeatures.Olympus,
1291+
[TuneOlympus]>;
1292+
12691293
// Ampere Computing
12701294
def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1,
12711295
[TuneAmpere1]>;

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
349349
PrefetchDistance = 128;
350350
MinPrefetchStride = 1024;
351351
break;
352+
case Olympus:
353+
EpilogueVectorizationMinVF = 8;
354+
MaxInterleaveFactor = 4;
355+
ScatterOverhead = 13;
356+
PrefFunctionAlignment = Align(16);
357+
PrefLoopAlignment = Align(32);
358+
MaxBytesForLoopAlignment = 16;
359+
VScaleForTuning = 1;
360+
break;
352361
}
353362

354363
if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)

llvm/lib/TargetParser/Host.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,8 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
288288
if (Implementer == "0x4e") { // NVIDIA Corporation
289289
return StringSwitch<const char *>(Part)
290290
.Case("0x004", "carmel")
291+
.Case("0x10", "olympus")
292+
.Case("0x010", "olympus")
291293
.Default("generic");
292294
}
293295

llvm/test/CodeGen/AArch64/cpus.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
55
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=carmel 2>&1 | FileCheck %s
6+
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=olympus 2>&1 | FileCheck %s
67
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s
78
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a34 2>&1 | FileCheck %s
89
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s

llvm/unittests/TargetParser/Host.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,14 @@ CPU revision : 0
305305

306306
EXPECT_EQ(sys::detail::getHostCPUNameForARM(CarmelProcCpuInfo), "carmel");
307307

308+
EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x4e\n"
309+
"CPU part : 0x10"),
310+
"olympus");
311+
312+
EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x4e\n"
313+
"CPU part : 0x010"),
314+
"olympus");
315+
308316
// Snapdragon mixed implementer quirk
309317
const std::string Snapdragon865ProcCPUInfo = R"(
310318
processor : 0

llvm/unittests/TargetParser/TargetParserTest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1168,6 +1168,7 @@ INSTANTIATE_TEST_SUITE_P(
11681168
AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"),
11691169
AArch64CPUTestParams("carmel", "armv8.2-a"),
11701170
AArch64CPUTestParams("grace", "armv9-a"),
1171+
AArch64CPUTestParams("olympus", "armv9.2-a"),
11711172
AArch64CPUTestParams("saphira", "armv8.4-a"),
11721173
AArch64CPUTestParams("oryon-1", "armv8.6-a")),
11731174
AArch64CPUTestParams::PrintToStringParamName);
@@ -1262,7 +1263,7 @@ INSTANTIATE_TEST_SUITE_P(
12621263
AArch64CPUAliasTestParams::PrintToStringParamName);
12631264

12641265
// Note: number of CPUs includes aliases.
1265-
static constexpr unsigned NumAArch64CPUArchs = 88;
1266+
static constexpr unsigned NumAArch64CPUArchs = 89;
12661267

12671268
TEST(TargetParserTest, testAArch64CPUArchList) {
12681269
SmallVector<StringRef, NumAArch64CPUArchs> List;

0 commit comments

Comments
 (0)