
Commit 62ec4b8

reusable buffers
1 parent 6fed218 commit 62ec4b8

8 files changed: 126 additions, 134 deletions

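Every file in this commit follows the same pattern: instead of malloc()-ing and free()-ing the plan's work buffer around each graph evaluation, a std::vector<uint8_t> is kept alive across evaluations, resized to the required work_size, and plan.work_data points into it. A minimal sketch of that pattern, assembled from the hunks below (the wrapper name compute_graph is hypothetical and used only for illustration; the ggml types and functions are the ones declared in ggml.h):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Reusable work buffer: resize() only reallocates when the requested size
    // grows beyond the current capacity, so repeated evaluations reuse one allocation.
    static std::vector<uint8_t> compute_plan_buffer;

    static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, n_threads);
        if (plan.work_size > 0) {
            compute_plan_buffer.resize(plan.work_size);
            plan.work_data = compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, gf);
        // no free(): the buffer stays alive for the next evaluation
    }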

examples/baby-llama/baby-llama.cpp

Lines changed: 8 additions & 15 deletions
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1598,13 +1600,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);

examples/benchmark/benchmark-matmult.cpp

Lines changed: 11 additions & 18 deletions
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@ int main(int argc, char ** argv) {
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf31);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf32);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
     }
     printf("\n");

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 11 additions & 18 deletions
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) {
         }
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) {
         model.train_tokens += n_batch * n_tokens;
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);

ggml.c

Lines changed: 2 additions & 1 deletion
@@ -16806,7 +16806,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -17338,6 +17338,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
         }
     }
 
+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {
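The new TODO notes that ggml_graph_compute_sugar() still allocates its work buffer on every call. One possible way to address it, sketched here purely as an illustration and not part of this commit, is a grow-only buffer that persists across calls (not thread-safe as written; assumes <stdlib.h> is available, as it already is in ggml.c):

    // Illustration only: reuse a grow-only work buffer across calls.
    static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
        static uint8_t * work_buf = NULL;
        static size_t    work_cap = 0;

        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
        if (plan.work_size > work_cap) {
            work_buf = (uint8_t *) realloc(work_buf, plan.work_size);
            GGML_ASSERT(work_buf);
            work_cap = plan.work_size;
        }
        plan.work_data = work_buf; // may be NULL when work_size == 0, matching the current behavior
        ggml_graph_compute(&plan, cgraph);
    }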

ggml.h

Lines changed: 1 addition & 1 deletion
@@ -421,7 +421,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;
 
         int n_threads;
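Typing work_data as uint8_t * instead of void * matches the new caller-side allocation strategy: the backing store of a std::vector<uint8_t> can be assigned directly, with no cast. A minimal illustration (assuming plan was produced by ggml_graph_compute_make_plan()):

    std::vector<uint8_t> buf;
    buf.resize(plan.work_size);
    plan.work_data = buf.data(); // uint8_t * to uint8_t *, no cast needed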

llama.cpp

Lines changed: 25 additions & 39 deletions
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1584,10 +1588,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1604,32 +1611,17 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
     }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+#endif
+
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
-#endif
 
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
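For readability, the control flow that results from the two llama_eval_internal hunks above can be read as follows (reconstructed from the diff; the Metal fallback details between the hunks are elided as a comment):

    bool call_ggml_graph_compute = true;

#ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor (lctx.ctx_metal, cur);
        call_ggml_graph_compute = false; // Metal already ran the graph
    } else {
        // fallback path: copy data to/from the Metal buffers, then let the
        // CPU ggml_graph_compute below handle this evaluation
    }
#endif

    if (call_ggml_graph_compute) {
        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
        if (plan.work_size > 0) {
            lctx.compute_plan_buffer.resize(plan.work_size);
            plan.work_data = lctx.compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, &gf);
    }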
@@ -2817,6 +2809,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2983,15 +2978,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         struct ggml_cgraph gf = ggml_build_forward(r);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         // we won't need these tensors again, reset the context to save memory
@@ -3166,15 +3158,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);
@@ -3282,15 +3271,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);
