
Commit bf63002

reusable buffers
1 parent 8fa7e06 commit bf63002

8 files changed: +126 -134 lines changed

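The pattern applied across every file below is the same: instead of malloc'ing the plan's work buffer before each ggml_graph_compute() call and free'ing it afterwards, the caller keeps a std::vector<uint8_t> alive and hands its storage to the plan. A minimal sketch of that convention, using only the API visible in this commit (`gf` and `n_threads` are placeholders for an already-built graph and a thread count):

    #include <cstdint>
    #include <vector>

    // Reusable work buffer: outlives the individual compute calls instead of
    // being malloc'ed and free'd around each one.
    std::vector<uint8_t> compute_plan_buffer;

    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
    if (plan.work_size > 0) {
        compute_plan_buffer.resize(plan.work_size);   // allocate (or grow) as needed
        plan.work_data = compute_plan_buffer.data();  // memory stays owned by the caller
    }
    ggml_graph_compute(&plan, &gf);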

examples/baby-llama/baby-llama.cpp

Lines changed: 8 additions & 15 deletions
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1598,13 +1600,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);

examples/benchmark/benchmark-matmult.cpp

Lines changed: 11 additions & 18 deletions
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
    }
 
    TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@ int main(int argc, char ** argv) {
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
            }
            ggml_graph_compute(&plan, &gf31);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
        }
 
        long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@ int main(int argc, char ** argv) {
 
        // Running a different graph computation to make sure we override the CPU cache lines
        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
            }
            ggml_graph_compute(&plan, &gf32);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
        }
    }
    printf("\n");

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 11 additions & 18 deletions
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) {
         }
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) {
         model.train_tokens += n_batch * n_tokens;
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);

ggml.c

Lines changed: 2 additions & 1 deletion
@@ -16330,7 +16330,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16864,6 +16864,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
     }
 }
 
+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {

ggml.h

Lines changed: 1 addition & 1 deletion
@@ -449,7 +449,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;
 
         int n_threads;
 
llama.cpp

Lines changed: 25 additions & 39 deletions
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1582,10 +1586,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1602,32 +1609,17 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
     }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+#endif
+
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
-#endif
 
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
@@ -2815,6 +2807,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2981,15 +2976,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         struct ggml_cgraph gf = ggml_build_forward(r);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         // we won't need these tensors again, reset the context to save memory
@@ -3164,15 +3156,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);
@@ -3280,15 +3269,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
        }
 
        ggml_free(cpy_ctx);
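In llama.cpp the buffer is a member of llama_context, so successive evaluations reuse a single allocation: std::vector::resize() only reallocates when the requested work_size exceeds the current capacity, and it never releases capacity when a later graph needs less. A rough illustration of that reuse (hypothetical loop; `gf`, `n_threads`, and `n_evals` are placeholders, not names from this commit):

    // After the first iteration the buffer is usually large enough, so resize()
    // is a cheap size adjustment rather than a fresh allocation.
    std::vector<uint8_t> compute_plan_buffer;   // in the real code, a llama_context member

    for (int i = 0; i < n_evals; ++i) {
        auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
        if (plan.work_size > 0) {
            compute_plan_buffer.resize(plan.work_size);
            plan.work_data = compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, &gf);
    }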
