ggml: Added run-time detection of neon, i8mm and sve

eddnjjn · eddnjjn · commit 83243679b0a5 · 2024-09-05T15:56:07.000+02:00
Adds run-time detection of the Arm instructions set features
neon, i8mm and sve for Linux and Apple build targets.
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -2514,6 +2514,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_cann       (void);
     GGML_API int ggml_cpu_has_llamafile  (void);
 
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
@@ -546,8 +546,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -658,8 +658,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -776,7 +776,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -842,12 +842,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -997,8 +997,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1518,8 +1518,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1980,7 +1980,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2391,12 +2391,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
@@ -127,10 +127,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
-#if defined(__ARM_FEATURE_SVE)
-extern int ggml_sve_cnt_b;
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -37,9 +37,15 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
+#if defined(__aarch64__)
+struct ggml_aarch64_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_aarch64_features = {-1, -1, -1, 0};
 #endif
+
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -3601,6 +3607,69 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_aarch64_features(void) {
+    if (ggml_aarch64_features.has_neon == -1) {
+
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+        ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        ggml_aarch64_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE)
+        ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+        if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+            oldp = 0;
+        }
+        ggml_aarch64_features.has_neon = oldp;
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+            oldp = 0;
+        }
+        ggml_aarch64_features.has_i8mm = oldp;
+
+        ggml_aarch64_features.has_sve = 0;
+        ggml_aarch64_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+        ggml_aarch64_features.has_neon = 1;
+#else
+        ggml_aarch64_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+        ggml_aarch64_features.has_i8mm = 1;
+#else
+        ggml_aarch64_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+        ggml_aarch64_features.has_sve = 1;
+        ggml_aarch64_features.sve_cnt = 16;
+#else
+        ggml_aarch64_features.has_sve = 0;
+        ggml_aarch64_features.sve_cnt = 0;
+#endif
+#endif
+    }
+}
+#endif
+
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
@@ -3699,10 +3768,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (!ggml_sve_cnt_b) {
-        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-    }
+#if defined(__aarch64__)
+    ggml_init_aarch64_features();
 #endif
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -23181,16 +23248,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_sve;
 #else
     return 0;
 #endif
@@ -23329,11 +23396,18 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__aarch64__)
+    return ggml_aarch64_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
 ////////////////////////////////////////////////////////////////////////////////