Skip to content

Commit ed094a5

Browse files
threadpool: further simplify and improve ggml_barrier
Avoid using strict memory order while polling, yet make sure that all threads go through a full memory barrier (memory fence) on ggml_barrier entrance and exit.
1 parent 2bd9f47 commit ed094a5

File tree

1 file changed

+15
-21
lines changed

1 file changed

+15
-21
lines changed

ggml/src/ggml.c

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3178,42 +3178,36 @@ inline static void ggml_critical_section_start(void) {
31783178
}
31793179
}
31803180

3181-
#ifdef GGML_USE_OPENMP
3182-
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3183-
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3181+
static void ggml_barrier(struct ggml_threadpool * tp) {
3182+
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
31843183
if (n_threads == 1) {
31853184
return;
31863185
}
31873186

3187+
#ifdef GGML_USE_OPENMP
31883188
#pragma omp barrier
3189-
}
31903189
#else
3191-
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3192-
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3193-
if (n_threads == 1) {
3194-
return;
3195-
}
3196-
3197-
atomic_int * n_barrier = &threadpool->n_barrier;
3198-
atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
3190+
int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
31993191

3200-
int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
3192+
// enter barrier (full seq-cst fence)
3193+
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
32013194

3202-
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
3195+
int last = 0;
3196+
if (n_barrier == (n_threads - 1)) {
32033197
// last thread
3204-
atomic_store(n_barrier, 0);
3205-
atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
3198+
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
3199+
last = 1;
32063200
} else {
32073201
// wait for other threads
3208-
while (true) {
3209-
if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
3210-
return;
3211-
}
3202+
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
32123203
ggml_thread_cpu_relax();
32133204
}
32143205
}
3215-
}
3206+
3207+
// exit barrier (full seq-cst fence)
3208+
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
32163209
#endif
3210+
}
32173211

32183212
// TODO: make this somehow automatically executed
32193213
// some sort of "sentry" mechanism

0 commit comments

Comments
 (0)