Skip to content

Commit 6fc5f17

Browse files
committed
detect NUMA systems and pin work threads to nodes (linux)
1 parent 7780e4f commit 6fc5f17

File tree

3 files changed

+156
-25
lines changed

3 files changed

+156
-25
lines changed

ggml.c

Lines changed: 152 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ static int sched_yield (void) {
7676
#include <stdatomic.h>
7777

7878
typedef void* thread_ret_t;
79+
80+
#include <sys/types.h>
81+
#include <sys/stat.h>
82+
#include <unistd.h>
83+
7984
#endif
8085

8186
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -103,6 +108,30 @@ typedef void* thread_ret_t;
103108
#define GGML_SOFT_MAX_UNROLL 4
104109
#define GGML_VEC_DOT_UNROLL 2
105110

111+
//
// logging
//
// GGML_DEBUG selects verbosity: higher values enable more detailed debug
// output; undefined (or 0) compiles all debug printing away entirely.

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

// unconditional print
#define GGML_PRINT(...) printf(__VA_ARGS__)
134+
106135
#ifdef GGML_USE_ACCELERATE
107136
// uncomment to use vDSP for soft max computation
108137
// note: not sure if it is actually faster
@@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
395424
}
396425
}
397426

398-
399427
//
400428
// timing
401429
//
@@ -452,6 +480,85 @@ int64_t ggml_cycles_per_ms(void) {
452480
#define ggml_perf_cycles_per_ms() 0
453481
#endif
454482

483+
//
484+
// NUMA support
485+
//
486+
487+
// Descriptor for a single NUMA node: the set of hardware threads
// (logical CPUs) that belong to it.
struct ggml_numa_node {
    uint32_t * cpus;   // logical CPU ids on this node
    uint32_t   n_cpus; // number of valid entries in cpus
};

// System-wide NUMA topology, filled in by ggml_numa_init().
struct ggml_numa_nodes {
    struct ggml_numa_node * nodes; // one entry per detected node
    uint32_t n_nodes;              // number of NUMA nodes
    uint32_t total_cpus;           // hardware threads on the whole system
};

// Global topology state; all-zero until ggml_numa_init() has run.
struct ggml_numa_nodes ggml_numa = {
    .nodes      = NULL,
    .n_nodes    = 0,
    .total_cpus = 0,
};
505+
506+
void ggml_numa_init(void)
507+
{
508+
if (ggml_numa.n_nodes > 0) return;
509+
#ifdef __linux__
510+
struct stat st;
511+
char path[256];
512+
int rv;
513+
// enumerate nodes
514+
while (true) {
515+
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes);
516+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
517+
if (stat(path, &st) != 0) break;
518+
++ggml_numa.n_nodes;
519+
}
520+
// enumerate CPUs
521+
while (true) {
522+
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus);
523+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
524+
if (stat(path, &st) != 0) break;
525+
++ggml_numa.total_cpus;
526+
}
527+
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus);
528+
ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
529+
GGML_ASSERT(ggml_numa.nodes != NULL);
530+
for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) {
531+
struct ggml_numa_node *node = &ggml_numa.nodes[n];
532+
node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t));
533+
GGML_ASSERT(node->cpus != NULL);
534+
GGML_PRINT_DEBUG("CPUs on node %u:", n);
535+
for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) {
536+
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
537+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
538+
if (stat(path, &st) == 0) {
539+
node->cpus[node->n_cpus++] = c;
540+
GGML_PRINT_DEBUG(" %u", c);
541+
}
542+
}
543+
GGML_PRINT_DEBUG("\n");
544+
}
545+
if (ggml_is_numa()) {
546+
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
547+
if (fptr != NULL) {
548+
char buf[42];
549+
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
550+
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
551+
}
552+
fclose(fptr);
553+
}
554+
}
555+
#else
556+
// TODO
557+
#endif
558+
}
559+
560+
// True when ggml_numa_init() detected more than one NUMA node.
bool ggml_is_numa(void) {
    return ggml_numa.n_nodes > 1;
}
561+
455562
//
456563
// cache line
457564
//
@@ -3405,30 +3512,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
34053512
*s = 1.f/(*s);
34063513
}
34073514

3408-
//
3409-
// logging
3410-
//
3411-
3412-
#if (GGML_DEBUG >= 1)
3413-
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3414-
#else
3415-
#define GGML_PRINT_DEBUG(...)
3416-
#endif
3417-
3418-
#if (GGML_DEBUG >= 5)
3419-
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3420-
#else
3421-
#define GGML_PRINT_DEBUG_5(...)
3422-
#endif
3423-
3424-
#if (GGML_DEBUG >= 10)
3425-
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3426-
#else
3427-
#define GGML_PRINT_DEBUG_10(...)
3428-
#endif
3429-
3430-
#define GGML_PRINT(...) printf(__VA_ARGS__)
3431-
34323515
//
34333516
// data types
34343517
//
@@ -13966,6 +14049,49 @@ typedef pthread_t ggml_thread_t;
1396614049

1396714050
#endif
1396814051

14052+
#ifdef __linux__
14053+
void set_numa_thread_affinity(int thread_n, int n_threads)
14054+
{
14055+
if (!ggml_is_numa()) return;
14056+
// run thread on node_num thread_n / (threads per node)
14057+
int node_num = thread_n / (n_threads / ggml_numa.n_nodes);
14058+
struct ggml_numa_node *node = &ggml_numa.nodes[node_num];
14059+
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
14060+
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
14061+
CPU_ZERO_S(setsize, cpus);
14062+
for (size_t i=0; i < node->n_cpus; ++i) {
14063+
CPU_SET_S(node->cpus[i], setsize, cpus);
14064+
}
14065+
int rv;
14066+
if ((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) {
14067+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
14068+
strerror(rv));
14069+
}
14070+
CPU_FREE(cpus);
14071+
}
14072+
void clear_numa_thread_affinity(void)
14073+
{
14074+
if (!ggml_is_numa()) return;
14075+
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
14076+
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
14077+
CPU_ZERO_S(setsize, cpus);
14078+
for (unsigned i=0; i < ggml_numa.total_cpus; ++i) {
14079+
CPU_SET_S(i, setsize, cpus);
14080+
}
14081+
int rv;
14082+
if((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) {
14083+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
14084+
strerror(rv));
14085+
}
14086+
CPU_FREE(cpus);
14087+
}
14088+
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
// note: signature must match the linux definition above, which takes no
// arguments — the original stub took (int, int) and would not link/compile
// consistently across platforms
void clear_numa_thread_affinity(void) {}
#endif
14094+
1396914095
struct ggml_compute_state_shared {
1397014096
ggml_lock_t spin;
1397114097

@@ -13990,6 +14116,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1399014116
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
1399114117

1399214118
const int n_threads = state->shared->n_threads;
14119+
set_numa_thread_affinity(state->params.ith, n_threads);
1399314120

1399414121
while (true) {
1399514122
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {

ggml.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,9 @@ extern "C" {
417417
GGML_API int64_t ggml_cycles(void);
418418
GGML_API int64_t ggml_cycles_per_ms(void);
419419

420+
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
421+
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
422+
420423
GGML_API void ggml_print_object (const struct ggml_object * obj);
421424
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
422425

llama.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ bool llama_mlock_supported() {
851851

852852
void llama_init_backend() {
853853
ggml_time_init();
854+
ggml_numa_init();
854855

855856
// needed to initialize f16 tables
856857
{

0 commit comments

Comments
 (0)