Skip to content

Commit 2bd9f47

Browse files
threadpool: skip polling for unused threads
Currently all threads do N polling rounds even if only 1 thread is active (n_threads_cur == 1). This commit adds a check to skip the polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell the thread sanitizer that it is written from one thread and read from other threads (i.e. not a race condition).
1 parent 23e0d70 commit 2bd9f47

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

ggml/src/ggml.c

Lines changed: 34 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -2016,7 +2016,7 @@ struct ggml_threadpool {
20162016

20172017
struct ggml_compute_state * workers; // per thread state
20182018
int n_threads_max; // number of threads in the pool
2019-
int n_threads_cur; // number of threads used in the current graph
2019+
atomic_int n_threads_cur; // number of threads used in the current graph
20202020

20212021
int32_t prio; // Scheduling priority
20222022
uint32_t poll; // Polling level (0 - no polling)
@@ -3180,22 +3180,23 @@ inline static void ggml_critical_section_start(void) {
31803180

31813181
#ifdef GGML_USE_OPENMP
31823182
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3183-
if (threadpool->n_threads_cur == 1) {
3183+
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3184+
if (n_threads == 1) {
31843185
return;
31853186
}
31863187

31873188
#pragma omp barrier
31883189
}
31893190
#else
31903191
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3191-
if (threadpool->n_threads_cur == 1) {
3192+
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3193+
if (n_threads == 1) {
31923194
return;
31933195
}
31943196

31953197
atomic_int * n_barrier = &threadpool->n_barrier;
31963198
atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
31973199

3198-
int n_threads = threadpool->n_threads_cur;
31993200
int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
32003201

32013202
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
@@ -19968,15 +19969,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1996819969

1996919970
#ifndef GGML_USE_OPENMP
1997019971

19971-
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19972+
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
19973+
struct ggml_threadpool * threadpool = state->threadpool;
19974+
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
19975+
return (state->ith < n_threads);
19976+
}
19977+
19978+
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
1997219979
struct ggml_threadpool * threadpool = state->threadpool;
1997319980

1997419981
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
1997519982

1997619983
// check for new graph/work
1997719984
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
1997819985
if (new_graph != state->last_graph) {
19979-
state->pending = (state->ith < threadpool->n_threads_cur);
19986+
state->pending = ggml_graph_compute_thread_active(state);
1998019987
state->last_graph = new_graph;
1998119988
}
1998219989

@@ -19986,11 +19993,16 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
1998619993
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
1998719994
struct ggml_threadpool * threadpool = state->threadpool;
1998819995

19996+
// Skip polling for unused threads
19997+
if (!ggml_graph_compute_thread_active(state)) {
19998+
return state->pending;
19999+
}
20000+
1998920001
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
1999020002
// Perhaps, we can adjust it dynamically based on load and things.
1999120003
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
1999220004

19993-
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
20005+
for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
1999420006
// No new work. Keep polling.
1999520007
ggml_thread_cpu_relax();
1999620008
}
@@ -20006,9 +20018,9 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
2000620018
}
2000720019

2000820020
ggml_mutex_lock_shared(&threadpool->mutex);
20009-
while (!ggml_graph_compute_ready(state)) {
20021+
while (!ggml_graph_compute_thread_ready(state)) {
2001020022
// No new work. Wait for the signal.
20011-
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
20023+
GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
2001220024
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
2001320025
}
2001420026
ggml_mutex_unlock_shared(&threadpool->mutex);
@@ -20055,12 +20067,17 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
2005520067
}
2005620068

2005720069
// Start processing new graph
20058-
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
20070+
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
2005920071
{
2006020072
// always take the mutex here because the worker threads are doing hybrid poll/wait
2006120073

2006220074
ggml_mutex_lock(&threadpool->mutex);
2006320075

20076+
GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
20077+
20078+
// Update the number of active threads
20079+
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
20080+
2006420081
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
2006520082

2006620083
if (threadpool->pause) {
@@ -20195,15 +20212,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2019520212
// No worker threads should be accessing the parameters below at this stage
2019620213
threadpool->cgraph = cgraph;
2019720214
threadpool->cplan = cplan;
20198-
threadpool->n_threads_cur = n_threads;
2019920215
threadpool->current_chunk = 0;
2020020216
threadpool->ec = GGML_STATUS_SUCCESS;
2020120217
}
2020220218

20203-
if (n_threads > threadpool->n_threads_max) {
20204-
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20205-
}
20206-
2020720219
#ifdef GGML_USE_OPENMP
2020820220
if (n_threads > 1) {
2020920221
#pragma omp parallel num_threads(n_threads)
@@ -20212,7 +20224,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2021220224
{
2021320225
// update the number of threads from the actual number of threads that we got from OpenMP
2021420226
n_threads = omp_get_num_threads();
20215-
threadpool->n_threads_cur = n_threads;
20227+
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
2021620228
}
2021720229

2021820230
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
@@ -20221,8 +20233,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2022120233
ggml_graph_compute_thread(&threadpool->workers[0]);
2022220234
}
2022320235
#else
20236+
if (n_threads > threadpool->n_threads_max) {
20237+
GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20238+
n_threads = threadpool->n_threads_max;
20239+
}
20240+
2022420241
// Kick all threads to start the new graph
20225-
ggml_graph_compute_kickoff(threadpool);
20242+
ggml_graph_compute_kickoff(threadpool, n_threads);
2022620243

2022720244
// This is a work thread too
2022820245
ggml_graph_compute_thread(&threadpool->workers[0]);

0 commit comments

Comments (0)