@@ -3177,55 +3177,36 @@ inline static void ggml_critical_section_start(void) {
 }
 }
 
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
     }
 
+#ifdef GGML_USE_OPENMP
     #pragma omp barrier
-}
 #else
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    if (n_threads == 1) {
-        return;
-    }
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
 
-    atomic_int * n_barrier = &threadpool->n_barrier;
-    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
-
-    // All threads go through the full fence (memory barrier) operation once to ensure
-    // that all previos updates have completed.
-    // The rest of the reads and writes can be relaxed, but the thread sanitizer wants
-    // to see an explicit acquire / release sequence to declare all futher accesses
-    // as safe.
-
-    memory_order passed_acquire = memory_order_relaxed;
-    memory_order passed_release = memory_order_relaxed;
-
-    #if defined(__has_feature)
-    #if __has_feature(thread_sanitizer)
-    passed_acquire = memory_order_acquire;
-    passed_release = memory_order_release;
-    #endif
-    #endif
-
-    if (atomic_fetch_add_explicit(n_barrier, 1, memory_order_seq_cst) == n_threads - 1) {
+    int last = 0;
+    if (n_barrier == (n_threads - 1)) {
         // last thread
-        atomic_store_explicit(n_barrier, 0, memory_order_relaxed);
-        atomic_fetch_add_explicit(n_barrier_passed, 1, passed_release);
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+        last = 1;
     } else {
         // wait for other threads
-        while (atomic_load_explicit(n_barrier_passed, passed_acquire) == passed_old) {
+        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
             ggml_thread_cpu_relax();
         }
     }
-}
+
+    // exit barrier (full seq-cst fence)
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
 #endif
+}
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
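For context, the refactored non-OpenMP path is a classic two-counter spin barrier: an entry counter (`n_barrier`) that the last arriving thread resets, and a generation counter (`n_barrier_passed`) that waiters spin on. Below is a minimal standalone sketch of the same pattern, not code from this commit: the `barrier`/`worker` names, the thread and round counts, and the pthread driver are invented for illustration, assuming C11 <stdatomic.h> and POSIX threads.

// Hypothetical sketch (not from the ggml tree): a two-counter spin barrier
// exercised by a small pthread driver. Build: cc -std=c11 -pthread sketch.c
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { N_THREADS = 4, N_ROUNDS = 10000 };

static atomic_int n_barrier        = 0; // threads currently inside the barrier
static atomic_int n_barrier_passed = 0; // completed barrier generations
static int        counter          = 0; // plain int, only written between barriers

static void barrier(void) {
    // snapshot the generation count before entering
    int n_passed = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int n = atomic_fetch_add_explicit(&n_barrier, 1, memory_order_seq_cst);

    int last = 0;
    if (n == N_THREADS - 1) {
        // last thread: reset the entry counter for the next generation
        atomic_store_explicit(&n_barrier, 0, memory_order_relaxed);
        last = 1;
    } else {
        // spin until the last thread bumps the generation count
        // (ggml calls ggml_thread_cpu_relax() in this loop)
        while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == n_passed) {
        }
    }

    // exit barrier (full seq-cst fence): only the last thread adds 1,
    // the others add 0 but still perform the fencing RMW
    atomic_fetch_add_explicit(&n_barrier_passed, last, memory_order_seq_cst);
}

static void * worker(void * arg) {
    int id = (int)(intptr_t) arg;
    for (int i = 0; i < N_ROUNDS; i++) {
        if (id == 0) {
            counter++; // single writer per round; the barrier separates rounds
        }
        barrier();
    }
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        pthread_create(&th[i], NULL, worker, (void *)(intptr_t) i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(th[i], NULL);
    }
    printf("counter = %d (expected %d)\n", counter, N_ROUNDS);
    return 0;
}

As the commit's "enter/exit (full seq-cst fence)" comments indicate, the relaxed spin load alone would not synchronize; instead every thread performs a seq-cst read-modify-write on `n_barrier_passed` on the way out, which pairs with the last thread's increment, so work done before the barrier is visible to all threads after it without the previous thread-sanitizer-specific acquire/release special-casing.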