Skip to content

Commit 8324367

Browse files
committed
ggml: Added run-time detection of neon, i8mm and sve
Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets.
1 parent 4db0478 commit 8324367

File tree

5 files changed

+105
-32
lines changed

5 files changed

+105
-32
lines changed

ggml/include/ggml.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,6 +2514,9 @@ extern "C" {
25142514
GGML_API int ggml_cpu_has_cann (void);
25152515
GGML_API int ggml_cpu_has_llamafile (void);
25162516

2517+
// get the sve vector length in bytes
2518+
GGML_API int ggml_cpu_get_sve_cnt(void);
2519+
25172520
//
25182521
// Internal types and functions exposed for tests and benchmarks
25192522
//

ggml/src/ggml-aarch64.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -546,8 +546,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
546546
UNUSED(blocklen);
547547

548548
#if defined(__ARM_FEATURE_SVE)
549-
if (ggml_sve_cnt_b == QK8_0) {
550-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
549+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
550+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
551551
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
552552
}
553553
#endif
@@ -658,8 +658,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
658658
UNUSED(blocklen);
659659

660660
#if defined(__ARM_FEATURE_SVE)
661-
if (ggml_sve_cnt_b == QK8_0) {
662-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
661+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
662+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
663663
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
664664
}
665665
#endif
@@ -776,7 +776,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
776776
UNUSED(blocklen);
777777

778778
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
779-
if (ggml_sve_cnt_b == QK8_0) {
779+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
780780
const void * b_ptr = vx;
781781
const void * a_ptr = vy;
782782
float * res_ptr = s;
@@ -842,12 +842,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
842842
return;
843843
}
844844
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
845-
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
845+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
846846
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
847847
"performance");
848848
}
849849
else if (ggml_cpu_has_neon()) {
850-
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
850+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
851851
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
852852
"quantization format for optimal performance");
853853
}
@@ -997,8 +997,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
997997
UNUSED(blocklen);
998998

999999
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1000-
if (ggml_sve_cnt_b == QK8_0) {
1001-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
1000+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
1001+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
10021002
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
10031003
}
10041004
#endif
@@ -1518,8 +1518,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
15181518
UNUSED(blocklen);
15191519

15201520
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1521-
if (ggml_sve_cnt_b == QK8_0) {
1522-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
1521+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
1522+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
15231523
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
15241524
}
15251525
#endif
@@ -1980,7 +1980,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
19801980
UNUSED(blocklen);
19811981

19821982
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1983-
if (ggml_sve_cnt_b == QK8_0) {
1983+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
19841984
const void * b_ptr = vx;
19851985
const void * a_ptr = vy;
19861986
float * res_ptr = s;
@@ -2391,12 +2391,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
23912391
return;
23922392
}
23932393
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2394-
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
2394+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
23952395
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
23962396
"performance");
23972397
}
23982398
else if (ggml_cpu_has_neon()) {
2399-
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
2399+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
24002400
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
24012401
"quantization format for optimal performance");
24022402
}

ggml/src/ggml-quants.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
38183818
float sumf = 0;
38193819

38203820
#if defined(__ARM_FEATURE_SVE)
3821-
if (ggml_sve_cnt_b == QK8_0) {
3821+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
38223822
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
38233823
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
38243824

@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
53035303
float sumf = 0;
53045304

53055305
#if defined(__ARM_FEATURE_SVE)
5306-
if (ggml_sve_cnt_b == QK8_0) {
5306+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
53075307
svfloat32_t sumv0 = svdup_n_f32(0.0f);
53085308
svfloat32_t sumv1 = svdup_n_f32(0.0f);
53095309

ggml/src/ggml-quants.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,6 @@ void iq2xs_free_impl(enum ggml_type type);
127127
void iq3xs_init_impl(int grid_size);
128128
void iq3xs_free_impl(int grid_size);
129129

130-
#if defined(__ARM_FEATURE_SVE)
131-
extern int ggml_sve_cnt_b;
132-
#endif
133-
134130
#ifdef __cplusplus
135131
}
136132
#endif

ggml/src/ggml.c

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,15 @@
3737
#include <unistd.h>
3838
#endif
3939

40-
#if defined(__ARM_FEATURE_SVE)
41-
int ggml_sve_cnt_b = 0;
40+
#if defined(__aarch64__)
41+
struct ggml_aarch64_features_type {
42+
int has_neon;
43+
int has_i8mm;
44+
int has_sve;
45+
int sve_cnt;
46+
} ggml_aarch64_features = {-1, -1, -1, 0};
4247
#endif
48+
4349
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
4450
#undef GGML_USE_LLAMAFILE
4551
#endif
@@ -3601,6 +3607,69 @@ static inline int ggml_up(int n, int m) {
36013607

36023608
////////////////////////////////////////////////////////////////////////////////
36033609

3610+
#if defined(__aarch64__)
3611+
3612+
#if defined(__linux__)
3613+
#include <sys/auxv.h>
3614+
#elif defined(__APPLE__)
3615+
#include <sys/sysctl.h>
3616+
#endif
3617+
3618+
static void ggml_init_aarch64_features(void) {
3619+
if (ggml_aarch64_features.has_neon == -1) {
3620+
3621+
#if defined(__linux__)
3622+
uint32_t hwcap = getauxval(AT_HWCAP);
3623+
uint32_t hwcap2 = getauxval(AT_HWCAP2);
3624+
3625+
ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
3626+
ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
3627+
ggml_aarch64_features.has_sve = !!(hwcap & HWCAP_SVE);
3628+
3629+
#if defined(__ARM_FEATURE_SVE)
3630+
ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3631+
#endif
3632+
#elif defined(__APPLE__)
3633+
int oldp = 0;
3634+
size_t size = sizeof(oldp);
3635+
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
3636+
oldp = 0;
3637+
}
3638+
ggml_aarch64_features.has_neon = oldp;
3639+
3640+
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
3641+
oldp = 0;
3642+
}
3643+
ggml_aarch64_features.has_i8mm = oldp;
3644+
3645+
ggml_aarch64_features.has_sve = 0;
3646+
ggml_aarch64_features.sve_cnt = 0;
3647+
#else
3648+
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
3649+
#if defined(__ARM_NEON)
3650+
ggml_aarch64_features.has_neon = 1;
3651+
#else
3652+
ggml_aarch64_features.has_neon = 0;
3653+
#endif
3654+
3655+
#if defined(__ARM_FEATURE_MATMUL_INT8)
3656+
ggml_aarch64_features.has_i8mm = 1;
3657+
#else
3658+
ggml_aarch64_features.has_i8mm = 0;
3659+
#endif
3660+
3661+
#if defined(__ARM_FEATURE_SVE)
3662+
ggml_aarch64_features.has_sve = 1;
3663+
ggml_aarch64_features.sve_cnt = 16;
3664+
#else
3665+
ggml_aarch64_features.has_sve = 0;
3666+
ggml_aarch64_features.sve_cnt = 0;
3667+
#endif
3668+
#endif
3669+
}
3670+
}
3671+
#endif
3672+
36043673
struct ggml_context * ggml_init(struct ggml_init_params params) {
36053674
// make this function thread safe
36063675
ggml_critical_section_start();
@@ -3699,10 +3768,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
36993768

37003769
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
37013770

3702-
#if defined(__ARM_FEATURE_SVE)
3703-
if (!ggml_sve_cnt_b) {
3704-
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3705-
}
3771+
#if defined(__aarch64__)
3772+
ggml_init_aarch64_features();
37063773
#endif
37073774

37083775
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -23181,16 +23248,16 @@ int ggml_cpu_has_fma(void) {
2318123248
}
2318223249

2318323250
int ggml_cpu_has_neon(void) {
23184-
#if defined(__ARM_NEON)
23185-
return 1;
23251+
#if defined(__aarch64__)
23252+
return ggml_aarch64_features.has_neon;
2318623253
#else
2318723254
return 0;
2318823255
#endif
2318923256
}
2319023257

2319123258
int ggml_cpu_has_sve(void) {
23192-
#if defined(__ARM_FEATURE_SVE)
23193-
return 1;
23259+
#if defined(__aarch64__)
23260+
return ggml_aarch64_features.has_sve;
2319423261
#else
2319523262
return 0;
2319623263
#endif
@@ -23329,11 +23396,18 @@ int ggml_cpu_has_vsx(void) {
2332923396
}
2333023397

2333123398
int ggml_cpu_has_matmul_int8(void) {
23332-
#if defined(__ARM_FEATURE_MATMUL_INT8)
23333-
return 1;
23399+
#if defined(__aarch64__)
23400+
return ggml_aarch64_features.has_i8mm;
2333423401
#else
2333523402
return 0;
2333623403
#endif
2333723404
}
2333823405

23406+
int ggml_cpu_get_sve_cnt(void) {
23407+
#if defined(__aarch64__)
23408+
return ggml_aarch64_features.sve_cnt;
23409+
#else
23410+
return 0;
23411+
#endif
23412+
}
2333923413
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)