Skip to content

Commit 32f945d

Browse files
threads: improve ggml_barrier scaling with large number of threads
Make sure n_barrier and n_barrier do not share the cache line to avoid cache line bouncing. This optimization shows performance improvements even for n_threads <= 8 cases. Resurect TSAN (Thread Sanitizer) check so that we can avoid doing expensive read-modify-write in the normal case and just use thread-fence as originally intended. --- Here is the original description and suggestions from Willy Tarreau : There's currently some false sharing between n_barrier and n_barrier_passed that is amplified in ggml_barrier() by the fact that all threads need to increment n_barrier when entering, while all previous threads continue to read n_barrier_passed, waiting for the last one to release them all. The side effect is that all these readers are slowing down all new threads by making the cache line bounce back and forth between readers and writers. Just placing them in two distinct cache lines is sufficient to boost the performance by 21% on a 80-core ARM server compared to the no-openmp version, and by 3% compared to the openmp version. Note that the variables could have been spread apart in the structure as well, but it doesn't seem that the size of this threadpool struct is critical so here we're simply aligning them. Finally, the same issue was present when leaving the barrier since all threads had to update the n_barrier_passed counter, though only one would add a non-zero value. This alone is responsible for half of the cost due to undesired serialization. It might be possible that using a small array of n_barrier counters could make things even faster on many-core systems, but it would likely complicate the logic needed to detect the last thread. Co-authored-by: Willy Tarreau <[email protected]>
1 parent d09770c commit 32f945d

File tree

1 file changed

+34
-11
lines changed

1 file changed

+34
-11
lines changed

ggml/src/ggml.c

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,21 @@ int ggml_sve_cnt_b = 0;
6363
#pragma warning(disable: 4702)
6464
#endif
6565

66+
// Note: once we move threading into a separate C++ file
67+
// will use std::hardware_destructive_interference_size instead of hardcoding it here
68+
// and we'll use C++ attribute syntax.
69+
#define GGML_CACHE_LINE 64
70+
71+
#if defined(__clang__) || defined(__GNUC__)
72+
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
73+
#endif
74+
75+
#if defined(__has_feature)
76+
#if __has_feature(thread_sanitizer)
77+
#define GGML_TSAN_ENABLED 1
78+
#endif
79+
#endif
80+
6681
#if defined(_WIN32)
6782

6883
#define WIN32_LEAN_AND_MEAN
@@ -72,6 +87,8 @@ int ggml_sve_cnt_b = 0;
7287
#include <windows.h>
7388

7489
#if !defined(__clang__)
90+
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
91+
7592
typedef volatile LONG atomic_int;
7693
typedef atomic_int atomic_bool;
7794
typedef atomic_int atomic_flag;
@@ -2007,8 +2024,8 @@ struct ggml_threadpool {
20072024

20082025
// synchronization primitives
20092026
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
2010-
atomic_int n_barrier;
2011-
atomic_int n_barrier_passed;
2027+
atomic_int GGML_CACHE_ALIGN n_barrier;
2028+
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
20122029
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
20132030

20142031
// these are atomic as an annotation for thread-sanitizer
@@ -3196,20 +3213,23 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
31963213
// enter barrier (full seq-cst fence)
31973214
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
31983215

3199-
int last = 0;
32003216
if (n_barrier == (n_threads - 1)) {
32013217
// last thread
32023218
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
3203-
last = 1;
3219+
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
32043220
} else {
32053221
// wait for other threads
32063222
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
32073223
ggml_thread_cpu_relax();
32083224
}
3209-
}
32103225

3211-
// exit barrier (full seq-cst fence)
3212-
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
3226+
#ifdef GGML_TSAN_ENABLED
3227+
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
3228+
atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
3229+
#else
3230+
atomic_thread_fence(memory_order_seq_cst);
3231+
#endif
3232+
}
32133233
#endif
32143234
}
32153235

@@ -20240,10 +20260,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
2024020260

2024120261
// sync thread state after polling
2024220262
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
20243-
struct ggml_threadpool * threadpool = state->threadpool;
20244-
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
20245-
// so instead we just use a dummy read-modify-write
20246-
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
20263+
#ifdef GGML_TSAN_ENABLED
20264+
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
20265+
atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
20266+
#else
20267+
atomic_thread_fence(memory_order_seq_cst);
20268+
#endif
20269+
UNUSED(state);
2024720270
}
2024820271

2024920272
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {

0 commit comments

Comments
 (0)