
Commit e5e38d4

msy-kato, slaren, and ggerganov committed
ggml : use OpenMP as a thread pool (llama/7606)
* ggml: Added OpenMP for multi-threads processing
* ggml : Limit the number of threads used to avoid deadlock
* update shared state n_threads in parallel region
* clear numa affinity for main thread even with openmp
* enable openmp by default
* fix msvc build
* disable openmp on macos
* ci : disable openmp with thread sanitizer
* Update ggml.c

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 2a6bab5 commit e5e38d4
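
The core of the change is OpenMP's parallel/single pattern: request a team of threads, let exactly one member record how many threads the runtime actually granted, then have every member (the requesting thread included) run the graph worker. Below is a minimal standalone sketch of that pattern, not the ggml code itself; the printf bodies stand in for ggml_graph_compute_thread. Build with e.g. cc -fopenmp.

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        int n_threads = 8; // requested team size

        // clamp the request, mirroring the MIN(n_threads, omp_get_max_threads())
        // added to ggml_graph_compute() in this commit
        if (n_threads > omp_get_max_threads()) {
            n_threads = omp_get_max_threads();
        }

        #pragma omp parallel num_threads(n_threads)
        {
            #pragma omp single
            {
                // exactly one thread runs this block; the implicit barrier at
                // its end publishes the corrected count to the whole team
                n_threads = omp_get_num_threads();
                printf("granted %d thread(s)\n", n_threads);
            }
            // every thread in the team is a worker, including the one above
            printf("worker %d of %d\n", omp_get_thread_num(), n_threads);
        }
        return 0;
    }

The implicit barrier at the end of the single block is why shared state can be updated safely there, and why n_threads in ggml_compute_state_shared loses its const qualifier in this diff.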

1 file changed (+73, -38 lines)


ggml.c (73 additions, 38 deletions)
@@ -5,6 +5,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"

+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
 #include <syscall.h>
 #endif

+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
@@ -1756,7 +1761,7 @@ struct ggml_compute_state_shared {
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;

-    const int n_threads;
+    int n_threads;

     // synchronization primitives
     atomic_int n_active; // num active threads
@@ -19670,6 +19675,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     return cplan;
 }

+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                workers[0].shared->n_threads = n_threads;
+                workers[0].shared->n_active  = n_threads;
+            }
+            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+        }
+    } else {
+        ggml_graph_compute_thread(&workers[0]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
@@ -19680,7 +19738,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }

-    const int n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
+
+#if defined(GGML_USE_OPENMP)
+    n_threads = MIN(n_threads, omp_get_max_threads());
+#endif

     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,
@@ -19696,47 +19758,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith    = j,
-                .shared = &state_shared,
-                .ec     = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();

-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd   = 0,
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
     }

+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles() - perf_start_cycles;
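
For contrast, the retained non-OpenMP fallback keeps the classic create/join pool shape: spawn n_threads - 1 POSIX threads, let the calling thread act as worker 0, then join. A minimal sketch of that shape, under the same caveat that it is illustrative only (worker_fn is a hypothetical stand-in for ggml_graph_compute_thread):

    #include <pthread.h>
    #include <stdio.h>

    enum { N_THREADS = 4 };

    static void * worker_fn(void * arg) {
        const int ith = *(const int *) arg;
        printf("worker %d running\n", ith);
        return NULL;
    }

    int main(void) {
        pthread_t thrd[N_THREADS];
        int       ith [N_THREADS];

        // worker 0 is the calling thread, so start the pool at j = 1
        for (int j = 1; j < N_THREADS; ++j) {
            ith[j] = j;
            pthread_create(&thrd[j], NULL, worker_fn, &ith[j]);
        }

        // "this is a work thread too"
        ith[0] = 0;
        worker_fn(&ith[0]);

        for (int j = 1; j < N_THREADS; ++j) {
            pthread_join(thrd[j], NULL);
        }
        return 0;
    }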
