
Commit a1e7c69

ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
1 parent 31cfbb1 commit a1e7c69

3 files changed: +86 -31 lines changed

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 0 additions & 2 deletions
@@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(

     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;

     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;
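The training example resets its `ggml_cgraph` by hand rather than calling `ggml_graph_reset()`, which is why these two assignments had to be deleted here. After this commit the hand reset touches only the node counters and perf fields; a minimal sketch (the helper name `reset_cgraph` is hypothetical, not from the commit):

    // With work/work_size gone from ggml_cgraph, only these fields remain
    // for a hand reset; the work buffer now lives in ggml_cgraph_context.
    static void reset_cgraph(struct ggml_cgraph * gf) {
        gf->n_nodes      = 0;
        gf->n_leafs      = 0;
        gf->perf_runs    = 0;
        gf->perf_cycles  = 0;
        gf->perf_time_us = 0;
    }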

ggml.c

Lines changed: 63 additions & 26 deletions
@@ -15773,8 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
         /*.n_nodes   =*/ 0,
         /*.n_leafs   =*/ 0,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size =*/ 0,
-        /*.work      =*/ NULL,
         /*.nodes     =*/ { NULL },
         /*.grads     =*/ { NULL },
         /*.leafs     =*/ { NULL },
@@ -15946,6 +15944,7 @@ void clear_numa_thread_affinity(void) {}

 struct ggml_compute_state_shared {
     struct ggml_cgraph * cgraph;
+    struct ggml_cgraph_context * cgraph_ctx;

     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
@@ -15975,6 +15974,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
     struct ggml_cgraph * cgraph = state->shared->cgraph;
+    struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;

     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /*.type  =*/ GGML_TASK_FINALIZE,
             /*.ith   =*/ 0,
             /*.nth   =*/ 0,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            /*.wsize =*/ ctx->work_size,
+            /*.wdata =*/ ctx->work_data,
         };

         if (node_n != -1) {
@@ -16057,8 +16057,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /*.type  =*/ GGML_TASK_COMPUTE,
                 /*.ith   =*/ state->ith,
                 /*.nth   =*/ node->n_tasks,
-                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+                /*.wsize =*/ ctx->work_size,
+                /*.wdata =*/ ctx->work_data,
             };

             if (state->ith < node->n_tasks) {
@@ -16069,23 +16069,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }

-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
+// Prepare for graph computing.
+// Will set: node->n_tasks, ctx->{work_size, planned}
+void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(ctx);
+    // This function is reentrant, but duplicate calls are unnecessary.
+    GGML_ASSERT(ctx->work_size == 0);
+    GGML_ASSERT(ctx->work_data == NULL);
+    GGML_ASSERT(!ctx->planned);

-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    int n_threads = cgraph->n_threads;
+    size_t work_size = 0;

     // initialize tasks + work buffer
     {
-        size_t work_size = 0;
-
         // thread scheduling for the different operations
         for (int i = 0; i < cgraph->n_nodes; i++) {
             struct ggml_tensor * node = cgraph->nodes[i];
@@ -16399,19 +16396,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             }
         }
+    }

-        if (cgraph->work != NULL && work_size > cgraph->work_size) {
-            GGML_ASSERT(false); // TODO: better handling
-        }
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+    }
+
+    ctx->work_size = work_size;
+    ctx->work_data = NULL;
+    ctx->planned   = true;
+}

-        if (work_size > 0 && cgraph->work == NULL) {
-            cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
+void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    if (ctx == NULL) {
+        ctx = alloca(sizeof(struct ggml_cgraph_context));
+        GGML_ASSERT(ctx);
+        ctx->work_size = 0;
+        ctx->work_data = NULL;
+        ctx->planned   = false;
+    } else {
+        // Even when the context has been planned, work_size and work_data
+        // may still hold their default values (no work buffer was required).
+        if (ctx->work_size > 0) {
+            GGML_ASSERT(ctx->work_data);
+        }
+    }

-        GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-        cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
+    if (!ctx->planned) {
+        ggml_graph_compute_plan(ctx, cgraph);
+        if (ctx->work_size > 0) {
+            ctx->work_data = malloc(ctx->work_size);
+            GGML_ASSERT(ctx->work_data);
+            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, ctx->work_size);
         }
     }

+    const int n_threads = cgraph->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_ctx              =*/ ctx,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {
@@ -16463,6 +16494,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     }
 }

+// Deprecated: kept only for backward compatibility.
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    UNUSED(ctx);
+    ggml_graph_compute_v2(NULL, cgraph);
+}
+
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * grad = cgraph->grads[i];
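Taken together, the change splits the old single call in two: `ggml_graph_compute_plan()` walks the nodes once to set `node->n_tasks` and size the work buffer, and `ggml_graph_compute_v2()` runs the thread pool against whatever buffer the context carries. A minimal sketch of the backward-compatible path, assuming a tensor `result` already built in some ggml context (caller code, not part of the commit):

    // Old call sites keep working: the deprecated wrapper ignores its
    // ggml_context argument and forwards to ggml_graph_compute_v2() with a
    // NULL context, so planning and the work-buffer malloc happen internally.
    struct ggml_cgraph gf = ggml_build_forward(result);
    gf.n_threads = 4;
    ggml_graph_compute(NULL, &gf);     // deprecated entry point
    ggml_graph_compute_v2(NULL, &gf);  // equivalent: NULL ctx plans + allocates inside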

ggml.h

Lines changed: 23 additions & 3 deletions
@@ -437,15 +437,23 @@ extern "C" {

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // graph compute context
+    struct ggml_cgraph_context {
+        // After a call to `ggml_graph_compute_plan()`, `planned` is set to true and
+        // `work_size` is updated to a non-zero value when a work buffer is required.
+        // In that case the caller MUST allocate memory for `work_data`.
+        // See https://github.com/ggerganov/ggml/issues/287
+        size_t work_size;
+        void * work_data;
+        bool planned; // true means ready to compute graph nodes.
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
         int n_threads;

-        size_t work_size;
-        struct ggml_tensor * work;
-
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
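Because `ggml_graph_compute_plan()` asserts that all three fields still hold their defaults, a caller-owned context has to start zeroed; a minimal sketch:

    // A caller-owned compute context must start at its defaults so the plan
    // step's asserts (work_size == 0, work_data == NULL, !planned) hold.
    struct ggml_cgraph_context cctx = {
        /*.work_size =*/ 0,
        /*.work_data =*/ NULL,
        /*.planned   =*/ false,
    };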
@@ -1297,6 +1305,18 @@ extern "C" {
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

+    // Since https://github.com/ggerganov/ggml/issues/287
+    GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Since https://github.com/ggerganov/ggml/issues/287
+    // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data` itself.
+    // To allocate the buffer explicitly instead:
+    // - call `ggml_graph_compute_plan()`;
+    // - allocate memory for `ctx->work_data`;
+    // - finally, call `ggml_graph_compute_v2()`.
+    // NOTE: do not set `ctx->planned` manually.
+    GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Deprecated: `ctx` is not required. Use `ggml_graph_compute_v2` instead.
+    // See https://github.com/ggerganov/ggml/issues/287
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
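Written out, the explicit-allocation flow from the comment above looks like the sketch below; `gf` is an assumed, already-built graph, error handling is elided, and the `free()` mirrors the `malloc()` that `ggml_graph_compute_v2()` would otherwise perform internally:

    #include <stdlib.h>  // malloc, free

    struct ggml_cgraph_context cctx = { /*.work_size =*/ 0, /*.work_data =*/ NULL, /*.planned =*/ false };

    ggml_graph_compute_plan(&cctx, &gf);          // sets node->n_tasks, cctx.work_size, cctx.planned
    if (cctx.work_size > 0) {
        cctx.work_data = malloc(cctx.work_size);  // buffer is owned by the caller
    }
    ggml_graph_compute_v2(&cctx, &gf);            // computes the graph with the caller's buffer

    free(cctx.work_data);                         // free(NULL) is a no-op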
