@@ -3178,42 +3178,36 @@ inline static void ggml_critical_section_start(void) {
3178
3178
}
3179
3179
}
3180
3180
// ggml_barrier(): makes the calling thread wait until n_threads_cur threads of the threadpool have reached this call.
// Review notes on this hunk ('-' = removed line, '+' = added line, unmarked = unchanged):
//  - the two per-build definitions (one under GGML_USE_OPENMP, one under #else) are merged into a
//    single function; the parameter is renamed from `threadpool` to `tp`, and the n_threads == 1
//    early return is now written once, ahead of the #ifdef.
//  - OpenMP build: the barrier itself is `#pragma omp barrier`.
//  - non-OpenMP build: a spin barrier on two atomic counters, n_barrier (arrivals in the current
//    round) and n_barrier_passed (changed when a round completes).
//  - the arrival and exit read-modify-writes now pass memory_order_seq_cst explicitly, and waiters
//    no longer return straight from the spin loop: every thread runs the exit RMW at the end.
3181
- #ifdef GGML_USE_OPENMP
3182
- static void ggml_barrier(struct ggml_threadpool * threadpool) {
3183
- int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3181
+ static void ggml_barrier(struct ggml_threadpool * tp) {
3182
+ int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
3184
3183
if (n_threads == 1) {
3185
3184
return;
3186
3185
}
3187
3186
3187
+ #ifdef GGML_USE_OPENMP
3188
3188
#pragma omp barrier
3189
- }
3190
3189
#else
3191
- static void ggml_barrier(struct ggml_threadpool * threadpool) {
3192
- int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
3193
- if (n_threads == 1) {
3194
- return;
3195
- }
3196
-
3197
- atomic_int * n_barrier = &threadpool->n_barrier;
3198
- atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
// snapshot of the round counter, taken before this thread registers its arrival;
// waiters spin below until the counter differs from this value
3190
+ int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
3199
3191
3200
- int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
3192
+ // enter barrier (full seq-cst fence)
3193
+ int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
3201
3194
3202
- if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
// `last` stays 0 for waiters and becomes 1 only in the thread whose fetch_add above
// returned n_threads - 1; it is the increment applied by the exit RMW at the end
3195
+ int last = 0;
3196
+ if (n_barrier == (n_threads - 1)) {
3203
3197
// last thread
3204
- atomic_store( n_barrier, 0);
3205
- atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed) ;
// reset the arrival counter for the next round; this relaxed store comes before
// the seq_cst exit RMW in program order
3198
+ atomic_store_explicit(&tp-> n_barrier, 0, memory_order_relaxed );
3199
+ last = 1 ;
3206
3200
} else {
3207
3201
// wait for other threads
3208
- while (true) {
3209
- if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
3210
- return;
3211
- }
3202
+ while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
3212
3203
ggml_thread_cpu_relax();
3213
3204
}
3214
3205
}
3215
- }
3206
+
3207
+ // exit barrier (full seq-cst fence)
// every thread performs this RMW: waiters add 0, the last thread adds 1, and that
// change of n_barrier_passed is what ends the spin loop of the waiting threads
3208
+ atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
3216
3209
#endif
3210
+ }
3217
3211
3218
3212
// TODO: make this somehow automatically executed
3219
3213
// some sort of "sentry" mechanism
0 commit comments