Skip to content

[compiler-rt][AArch64][FMV] Use the hw.optional.arm.caps fast path #95275

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 75 additions & 2 deletions compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
#if TARGET_OS_OSX || TARGET_OS_IPHONE
#include <sys/sysctl.h>

#if __has_include(<arm/cpu_capabilities_public.h>)
#include <arm/cpu_capabilities_public.h>
#define HAS_CPU_CAPABILITIES_PUBLIC_H 1
#endif

static bool isKnownAndSupported(const char *name) {
int32_t val = 0;
size_t size = sizeof(val);
Expand All @@ -10,6 +15,19 @@ static bool isKnownAndSupported(const char *name) {
return val;
}

// Returns `features` augmented with the bits that are not read from a
// capability query but follow from the bits already collected.
static uint64_t deriveImplicitFeatures(uint64_t features) {
  // Bits that are set unconditionally on every mask returned from here:
  // FEAT_FP is treated as always enabled, and FEAT_INIT is always added.
  uint64_t implied = (1ULL << FEAT_FP) | (1ULL << FEAT_INIT);

  // FEAT_SSBS2 implies FEAT_SSBS.
  if (features & (1ULL << FEAT_SSBS2))
    implied |= (1ULL << FEAT_SSBS);

  return features | implied;
}

void __init_cpu_features_resolver(void) {
// On Darwin platforms, this may be called concurrently by multiple threads
// because the resolvers that use it are called lazily at runtime (unlike on
Expand All @@ -21,6 +39,62 @@ void __init_cpu_features_resolver(void) {

uint64_t features = 0;

#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H
uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0};
size_t len = sizeof(feats_bitvec);
// When hw.optional.arm.caps is available (macOS 15.0+, iOS 18.0+), use the
// fast path to get all the feature bits, otherwise fall back to the slow
// ~20-something sysctls path.
if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) {

// Copies one capability bit into the feature mask: if bit FROM is set in
// feats_bitvec (bit FROM % 8 of byte FROM / 8), set bit TO in features.
// Both arguments are parenthesized so that expression arguments expand with
// the intended precedence. FROM is evaluated twice, so it must be free of
// side effects. Expects `feats_bitvec` and `features` to be in scope at the
// point of use.
#define CHECK_BIT(FROM, TO)                                                    \
  do {                                                                         \
    if (feats_bitvec[(FROM) / 8] & (1u << ((FROM) & 7))) {                     \
      features |= (1ULL << (TO));                                              \
    }                                                                          \
  } while (0)

CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM);
CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2);
CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML);
CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD);
CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3);
CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM);
CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE);
CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2);
CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1);
CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES);
CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL);
CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES);
CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB);
CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS);
CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC);
CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2);
CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA);
CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT);
CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB);
CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2);
CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16);
CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM);
CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT);
CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16);
CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I said on the other review SSBS2 implies SSBS.

CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI);
CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD);
CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC);
CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME);
CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2);
CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64);
CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64);

features = deriveImplicitFeatures(features);

__atomic_store(&__aarch64_cpu_features.features, &features,
__ATOMIC_RELAXED);
return;
}
#endif

// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
static const struct {
const char *sysctl_name;
Expand All @@ -32,7 +106,6 @@ void __init_cpu_features_resolver(void) {
{"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD},
{"hw.optional.arm.FEAT_RDM", FEAT_RDM},
{"hw.optional.arm.FEAT_LSE", FEAT_LSE},
{"hw.optional.floatingpoint", FEAT_FP},
{"hw.optional.AdvSIMD", FEAT_SIMD},
{"hw.optional.armv8_crc32", FEAT_CRC},
{"hw.optional.arm.FEAT_SHA1", FEAT_SHA1},
Expand Down Expand Up @@ -62,7 +135,7 @@ void __init_cpu_features_resolver(void) {
if (isKnownAndSupported(feature_checks[I].sysctl_name))
features |= (1ULL << feature_checks[I].feature);

features |= (1ULL << FEAT_INIT);
features = deriveImplicitFeatures(features);

__atomic_store(&__aarch64_cpu_features.features, &features,
__ATOMIC_RELAXED);
Expand Down
Loading