@@ -2016,7 +2016,7 @@ struct ggml_threadpool {
2016
2016
2017
2017
struct ggml_compute_state * workers; // per thread state
2018
2018
int n_threads_max; // number of threads in the pool
2019
- int n_threads_cur; // number of threads used in the current graph
2019
+ atomic_int n_threads_cur; // number of threads used in the current graph
2020
2020
2021
2021
int32_t prio; // Scheduling priority
2022
2022
uint32_t poll; // Polling level (0 - no polling)
@@ -3180,22 +3180,23 @@ inline static void ggml_critical_section_start(void) {
3180
3180
3181
3181
#ifdef GGML_USE_OPENMP
3182
3182
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3183
- if (threadpool->n_threads_cur == 1) {
3183
+ int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3184
+ if (n_threads == 1) {
3184
3185
return;
3185
3186
}
3186
3187
3187
3188
#pragma omp barrier
3188
3189
}
3189
3190
#else
3190
3191
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3191
- if (threadpool->n_threads_cur == 1) {
3192
+ int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3193
+ if (n_threads == 1) {
3192
3194
return;
3193
3195
}
3194
3196
3195
3197
atomic_int * n_barrier = &threadpool->n_barrier;
3196
3198
atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
3197
3199
3198
- int n_threads = threadpool->n_threads_cur;
3199
3200
int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
3200
3201
3201
3202
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
@@ -19968,15 +19969,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19968
19969
19969
19970
#ifndef GGML_USE_OPENMP
19970
19971
19971
- static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19972
+ static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
19973
+ struct ggml_threadpool * threadpool = state->threadpool;
19974
+ int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
19975
+ return (state->ith < n_threads);
19976
+ }
19977
+
19978
+ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
19972
19979
struct ggml_threadpool * threadpool = state->threadpool;
19973
19980
19974
19981
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
19975
19982
19976
19983
// check for new graph/work
19977
19984
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
19978
19985
if (new_graph != state->last_graph) {
19979
- state->pending = (state->ith < threadpool->n_threads_cur );
19986
+ state->pending = ggml_graph_compute_thread_active (state);
19980
19987
state->last_graph = new_graph;
19981
19988
}
19982
19989
@@ -19986,11 +19993,16 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19986
19993
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
19987
19994
struct ggml_threadpool * threadpool = state->threadpool;
19988
19995
19996
+ // Skip polling for unused threads
19997
+ if (!ggml_graph_compute_thread_active(state)) {
19998
+ return state->pending;
19999
+ }
20000
+
19989
20001
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
19990
20002
// Perhaps, we can adjust it dynamically based on load and things.
19991
20003
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
19992
20004
19993
- for (uint64_t i=0; !ggml_graph_compute_ready (state) && i< n_rounds; i++) {
20005
+ for (uint64_t i=0; !ggml_graph_compute_thread_ready (state) && i < n_rounds; i++) {
19994
20006
// No new work. Keep polling.
19995
20007
ggml_thread_cpu_relax();
19996
20008
}
@@ -20006,9 +20018,9 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
20006
20018
}
20007
20019
20008
20020
ggml_mutex_lock_shared(&threadpool->mutex);
20009
- while (!ggml_graph_compute_ready (state)) {
20021
+ while (!ggml_graph_compute_thread_ready (state)) {
20010
20022
// No new work. Wait for the signal.
20011
- GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
20023
+ GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping) \n", state->ith);
20012
20024
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
20013
20025
}
20014
20026
ggml_mutex_unlock_shared(&threadpool->mutex);
@@ -20055,12 +20067,17 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
20055
20067
}
20056
20068
20057
20069
// Start processing new graph
20058
- static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
20070
+ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads )
20059
20071
{
20060
20072
// always take the mutex here because the worker threads are doing hybrid poll/wait
20061
20073
20062
20074
ggml_mutex_lock(&threadpool->mutex);
20063
20075
20076
+ GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
20077
+
20078
+ // Update the number of active threads
20079
+ atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
20080
+
20064
20081
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
20065
20082
20066
20083
if (threadpool->pause) {
@@ -20195,15 +20212,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20195
20212
// No worker threads should be accessing the parameters below at this stage
20196
20213
threadpool->cgraph = cgraph;
20197
20214
threadpool->cplan = cplan;
20198
- threadpool->n_threads_cur = n_threads;
20199
20215
threadpool->current_chunk = 0;
20200
20216
threadpool->ec = GGML_STATUS_SUCCESS;
20201
20217
}
20202
20218
20203
- if (n_threads > threadpool->n_threads_max) {
20204
- GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20205
- }
20206
-
20207
20219
#ifdef GGML_USE_OPENMP
20208
20220
if (n_threads > 1) {
20209
20221
#pragma omp parallel num_threads(n_threads)
@@ -20212,7 +20224,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20212
20224
{
20213
20225
// update the number of threads from the actual number of threads that we got from OpenMP
20214
20226
n_threads = omp_get_num_threads();
20215
- threadpool->n_threads_cur = n_threads;
20227
+ atomic_store_explicit(& threadpool->n_threads_cur, n_threads, memory_order_relaxed) ;
20216
20228
}
20217
20229
20218
20230
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
@@ -20221,8 +20233,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20221
20233
ggml_graph_compute_thread(&threadpool->workers[0]);
20222
20234
}
20223
20235
#else
20236
+ if (n_threads > threadpool->n_threads_max) {
20237
+ GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20238
+ n_threads = threadpool->n_threads_max;
20239
+ }
20240
+
20224
20241
// Kick all threads to start the new graph
20225
- ggml_graph_compute_kickoff(threadpool);
20242
+ ggml_graph_compute_kickoff(threadpool, n_threads );
20226
20243
20227
20244
// This is a work thread too
20228
20245
ggml_graph_compute_thread(&threadpool->workers[0]);
0 commit comments