Skip to content

Commit 4f8c961

Browse files
authored
[compiler-rt][AArch64][FMV] Use the hw.optional.arm.caps fast path (#95275)
macOS 15.0 and iOS 18.0 added a new sysctl that fetches a bitvector of all the hw.optional.arm.FEAT_* values in one go. Using it has a performance advantage over making multiple round-trips to the kernel and back, but since it is not present in older OSes, we still need the slow fallback.
1 parent 785dc76 commit 4f8c961

File tree

1 file changed

+75
-2
lines changed
  • compiler-rt/lib/builtins/cpu_model/aarch64/fmv

1 file changed

+75
-2
lines changed

compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
#if TARGET_OS_OSX || TARGET_OS_IPHONE
33
#include <sys/sysctl.h>
44

5+
#if __has_include(<arm/cpu_capabilities_public.h>)
6+
#include <arm/cpu_capabilities_public.h>
7+
#define HAS_CPU_CAPABILITIES_PUBLIC_H 1
8+
#endif
9+
510
static bool isKnownAndSupported(const char *name) {
611
int32_t val = 0;
712
size_t size = sizeof(val);
@@ -10,6 +15,19 @@ static bool isKnownAndSupported(const char *name) {
1015
return val;
1116
}
1217

18+
static uint64_t deriveImplicitFeatures(uint64_t features) {
19+
// FEAT_SSBS2 implies FEAT_SSBS
20+
if ((1ULL << FEAT_SSBS2) & features)
21+
features |= (1ULL << FEAT_SSBS);
22+
23+
// FEAT_FP is always enabled
24+
features |= (1ULL << FEAT_FP);
25+
26+
features |= (1ULL << FEAT_INIT);
27+
28+
return features;
29+
}
30+
1331
void __init_cpu_features_resolver(void) {
1432
// On Darwin platforms, this may be called concurrently by multiple threads
1533
// because the resolvers that use it are called lazily at runtime (unlike on
@@ -21,6 +39,62 @@ void __init_cpu_features_resolver(void) {
2139

2240
uint64_t features = 0;
2341

42+
#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H
43+
uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0};
44+
size_t len = sizeof(feats_bitvec);
45+
// When hw.optional.arm.caps is available (macOS 15.0+, iOS 18.0+), use the
46+
// fast path to get all the feature bits, otherwise fall back to the slow
47+
// ~20-something sysctls path.
48+
if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) {
49+
50+
// CHECK_BIT(FROM, TO): if bit number FROM is set in the byte array
// `feats_bitvec` (bit N lives in byte N / 8, at position N % 8 within that
// byte), set bit number TO in the 64-bit mask `features`.
// Both `feats_bitvec` and `features` must be in scope at the expansion site.
// Arguments are fully parenthesized so that expressions such as `BASE + 1`
// are indexed and shifted as a whole.
#define CHECK_BIT(FROM, TO)                                                    \
  do {                                                                         \
    if (feats_bitvec[(FROM) / 8] & (1u << ((FROM) & 7))) {                     \
      features |= (1ULL << (TO));                                              \
    }                                                                          \
  } while (0)
56+
57+
CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM);
58+
CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2);
59+
CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML);
60+
CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD);
61+
CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3);
62+
CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM);
63+
CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE);
64+
CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2);
65+
CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1);
66+
CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES);
67+
CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL);
68+
CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES);
69+
CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB);
70+
CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS);
71+
CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC);
72+
CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2);
73+
CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA);
74+
CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT);
75+
CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB);
76+
CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2);
77+
CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16);
78+
CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM);
79+
CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT);
80+
CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16);
81+
CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2);
82+
CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI);
83+
CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD);
84+
CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC);
85+
CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME);
86+
CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2);
87+
CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64);
88+
CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64);
89+
90+
features = deriveImplicitFeatures(features);
91+
92+
__atomic_store(&__aarch64_cpu_features.features, &features,
93+
__ATOMIC_RELAXED);
94+
return;
95+
}
96+
#endif
97+
2498
// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
2599
static const struct {
26100
const char *sysctl_name;
@@ -32,7 +106,6 @@ void __init_cpu_features_resolver(void) {
32106
{"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD},
33107
{"hw.optional.arm.FEAT_RDM", FEAT_RDM},
34108
{"hw.optional.arm.FEAT_LSE", FEAT_LSE},
35-
{"hw.optional.floatingpoint", FEAT_FP},
36109
{"hw.optional.AdvSIMD", FEAT_SIMD},
37110
{"hw.optional.armv8_crc32", FEAT_CRC},
38111
{"hw.optional.arm.FEAT_SHA1", FEAT_SHA1},
@@ -62,7 +135,7 @@ void __init_cpu_features_resolver(void) {
62135
if (isKnownAndSupported(feature_checks[I].sysctl_name))
63136
features |= (1ULL << feature_checks[I].feature);
64137

65-
features |= (1ULL << FEAT_INIT);
138+
features = deriveImplicitFeatures(features);
66139

67140
__atomic_store(&__aarch64_cpu_features.features, &features,
68141
__ATOMIC_RELAXED);

0 commit comments

Comments
 (0)