@@ -76,6 +76,11 @@ static int sched_yield (void) {
76
76
#include <stdatomic.h>
77
77
78
78
typedef void * thread_ret_t ;
79
+
80
+ #include <sys/types.h>
81
+ #include <sys/stat.h>
82
+ #include <unistd.h>
83
+
79
84
#endif
80
85
81
86
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -103,6 +108,30 @@ typedef void* thread_ret_t;
103
108
#define GGML_SOFT_MAX_UNROLL 4
104
109
#define GGML_VEC_DOT_UNROLL 2
105
110
111
//
// logging
//

// GGML_DEBUG selects logging verbosity. An undefined macro evaluates to 0
// inside #if, but that is implicit and warns under -Wundef — give it an
// explicit default so the checks below are always well-defined.
#ifndef GGML_DEBUG
#define GGML_DEBUG 0
#endif

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

// unconditional output (warnings, user-facing messages)
#define GGML_PRINT(...) printf(__VA_ARGS__)
106
135
#ifdef GGML_USE_ACCELERATE
107
136
// uncomment to use vDSP for soft max computation
108
137
// note: not sure if it is actually faster
@@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
395
424
}
396
425
}
397
426
398
-
399
427
//
400
428
// timing
401
429
//
@@ -452,6 +480,85 @@ int64_t ggml_cycles_per_ms(void) {
452
480
#define ggml_perf_cycles_per_ms () 0
453
481
#endif
454
482
483
//
// NUMA support
//

// One NUMA node: the hardware threads (CPUs) that belong to it.
struct ggml_numa_node {
    uint32_t * cpus; // hardware threads on this node
    uint32_t   n_cpus;
};

// System NUMA topology, filled in lazily by ggml_numa_init().
struct ggml_numa_nodes {
    struct ggml_numa_node * nodes;
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
};

// File-scope singleton holding the detected topology. `static` keeps this
// mutable state out of the global namespace; external code goes through
// ggml_numa_init() / ggml_is_numa() instead of touching it directly.
static struct ggml_numa_nodes ggml_numa = {
    .nodes      = NULL,
    .n_nodes    = 0,
    .total_cpus = 0,
};
505
+
506
+ void ggml_numa_init (void )
507
+ {
508
+ if (ggml_numa .n_nodes > 0 ) return ;
509
+ #ifdef __linux__
510
+ struct stat st ;
511
+ char path [256 ];
512
+ int rv ;
513
+ // enumerate nodes
514
+ while (true) {
515
+ rv = snprintf (path , sizeof (path ), "/sys/devices/system/node/node%u" , ggml_numa .n_nodes );
516
+ GGML_ASSERT (rv > 0 && (unsigned )rv < sizeof (path ));
517
+ if (stat (path , & st ) != 0 ) break ;
518
+ ++ ggml_numa .n_nodes ;
519
+ }
520
+ // enumerate CPUs
521
+ while (true) {
522
+ rv = snprintf (path , sizeof (path ), "/sys/devices/system/cpu/cpu%u" , ggml_numa .total_cpus );
523
+ GGML_ASSERT (rv > 0 && (unsigned )rv < sizeof (path ));
524
+ if (stat (path , & st ) != 0 ) break ;
525
+ ++ ggml_numa .total_cpus ;
526
+ }
527
+ GGML_PRINT_DEBUG ("found %u numa nodes, %u CPUs\n" , ggml_numa .n_nodes , ggml_numa .total_cpus );
528
+ ggml_numa .nodes = calloc (ggml_numa .n_nodes , sizeof (struct ggml_numa_node ));
529
+ GGML_ASSERT (ggml_numa .nodes != NULL );
530
+ for (uint32_t n = 0 ; n < ggml_numa .n_nodes ; ++ n ) {
531
+ struct ggml_numa_node * node = & ggml_numa .nodes [n ];
532
+ node -> cpus = calloc (ggml_numa .total_cpus , sizeof (uint32_t ));
533
+ GGML_ASSERT (node -> cpus != NULL );
534
+ GGML_PRINT_DEBUG ("CPUs on node %u:" , n );
535
+ for (uint32_t c = 0 ; c < ggml_numa .total_cpus ; ++ c ) {
536
+ rv = snprintf (path , sizeof (path ), "/sys/devices/system/node/node%u/cpu%u" , n , c );
537
+ GGML_ASSERT (rv > 0 && (unsigned )rv < sizeof (path ));
538
+ if (stat (path , & st ) == 0 ) {
539
+ node -> cpus [node -> n_cpus ++ ] = c ;
540
+ GGML_PRINT_DEBUG (" %u" , c );
541
+ }
542
+ }
543
+ GGML_PRINT_DEBUG ("\n" );
544
+ }
545
+ if (ggml_is_numa ()) {
546
+ FILE * fptr = fopen ("/proc/sys/kernel/numa_balancing" , "r" );
547
+ if (fptr != NULL ) {
548
+ char buf [42 ];
549
+ if (fgets (buf , sizeof (buf ), fptr ) && strncmp (buf , "0\n" , sizeof (buf )) != 0 ) {
550
+ GGML_PRINT ("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n" );
551
+ }
552
+ fclose (fptr );
553
+ }
554
+ }
555
+ #else
556
+ // TODO
557
+ #endif
558
+ }
559
+
560
+ bool ggml_is_numa (void ) { return ggml_numa .n_nodes > 1 ; }
561
+
455
562
//
456
563
// cache line
457
564
//
@@ -3405,30 +3512,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3405
3512
* s = 1.f /(* s );
3406
3513
}
3407
3514
3408
- //
3409
- // logging
3410
- //
3411
-
3412
- #if (GGML_DEBUG >= 1 )
3413
- #define GGML_PRINT_DEBUG (...) printf(__VA_ARGS__)
3414
- #else
3415
- #define GGML_PRINT_DEBUG (...)
3416
- #endif
3417
-
3418
- #if (GGML_DEBUG >= 5 )
3419
- #define GGML_PRINT_DEBUG_5 (...) printf(__VA_ARGS__)
3420
- #else
3421
- #define GGML_PRINT_DEBUG_5 (...)
3422
- #endif
3423
-
3424
- #if (GGML_DEBUG >= 10 )
3425
- #define GGML_PRINT_DEBUG_10 (...) printf(__VA_ARGS__)
3426
- #else
3427
- #define GGML_PRINT_DEBUG_10 (...)
3428
- #endif
3429
-
3430
- #define GGML_PRINT (...) printf(__VA_ARGS__)
3431
-
3432
3515
//
3433
3516
// data types
3434
3517
//
@@ -13966,6 +14049,49 @@ typedef pthread_t ggml_thread_t;
13966
14049
13967
14050
#endif
13968
14051
14052
+ #ifdef __linux__
14053
+ void set_numa_thread_affinity (int thread_n , int n_threads )
14054
+ {
14055
+ if (!ggml_is_numa ()) return ;
14056
+ // run thread on node_num thread_n / (threads per node)
14057
+ int node_num = thread_n / (n_threads / ggml_numa .n_nodes );
14058
+ struct ggml_numa_node * node = & ggml_numa .nodes [node_num ];
14059
+ size_t setsize = CPU_ALLOC_SIZE (ggml_numa .total_cpus );
14060
+ cpu_set_t * cpus = CPU_ALLOC (ggml_numa .total_cpus );
14061
+ CPU_ZERO_S (setsize , cpus );
14062
+ for (size_t i = 0 ; i < node -> n_cpus ; ++ i ) {
14063
+ CPU_SET_S (node -> cpus [i ], setsize , cpus );
14064
+ }
14065
+ int rv ;
14066
+ if ((rv = pthread_setaffinity_np (pthread_self (), setsize , cpus ))) {
14067
+ fprintf (stderr , "warning: pthread_setaffinity_np() failed: %s\n" ,
14068
+ strerror (rv ));
14069
+ }
14070
+ CPU_FREE (cpus );
14071
+ }
14072
+ void clear_numa_thread_affinity (void )
14073
+ {
14074
+ if (!ggml_is_numa ()) return ;
14075
+ size_t setsize = CPU_ALLOC_SIZE (ggml_numa .total_cpus );
14076
+ cpu_set_t * cpus = CPU_ALLOC (ggml_numa .total_cpus );
14077
+ CPU_ZERO_S (setsize , cpus );
14078
+ for (unsigned i = 0 ; i < ggml_numa .total_cpus ; ++ i ) {
14079
+ CPU_SET_S (i , setsize , cpus );
14080
+ }
14081
+ int rv ;
14082
+ if ((rv = pthread_setaffinity_np (pthread_self (), setsize , cpus ))) {
14083
+ fprintf (stderr , "warning: pthread_setaffinity_np() failed: %s\n" ,
14084
+ strerror (rv ));
14085
+ }
14086
+ CPU_FREE (cpus );
14087
+ }
14088
+ #else
14089
+ // TODO: Windows etc.
14090
+ // (the linux implementation may also work on BSD, someone should test)
14091
+ void set_numa_thread_affinity (int thread_n , int n_threads ) { UNUSED (thread_n ); UNUSED (n_threads ); }
14092
+ void clear_numa_thread_affinity (int thread_n , int n_threads ) { UNUSED (thread_n ); UNUSED (n_threads ); }
14093
+ #endif
14094
+
13969
14095
struct ggml_compute_state_shared {
13970
14096
ggml_lock_t spin ;
13971
14097
@@ -13990,6 +14116,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
13990
14116
struct ggml_compute_state * state = (struct ggml_compute_state * ) data ;
13991
14117
13992
14118
const int n_threads = state -> shared -> n_threads ;
14119
+ set_numa_thread_affinity (state -> params .ith , n_threads );
13993
14120
13994
14121
while (true) {
13995
14122
if (atomic_fetch_add (& state -> shared -> n_ready , 1 ) == n_threads - 1 ) {
0 commit comments