
Commit b1331d7

reusable buffers

1 parent cb1dec0 commit b1331d7

8 files changed, +126 -134 lines changed
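Every file below makes the same change: the per-call malloc/free of plan.work_data is replaced with a byte buffer that outlives the call and is merely resized to plan.work_size. A minimal sketch of the pattern, using the plan API that appears in this diff; the function name, loop, and parameters are placeholders for the surrounding code at each call site:

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"   // ggml_cgraph, ggml_graph_compute_make_plan, ggml_graph_compute (as in this branch)

void run_examples(struct ggml_cgraph * gf, int n_threads, int n_iterations) {
    // declared once, outside the loop; reused for every graph evaluation
    auto compute_plan_buffer = std::vector<uint8_t>();

    for (int i = 0; i < n_iterations; ++i) {
        auto plan = ggml_graph_compute_make_plan(gf, n_threads);
        if (plan.work_size > 0) {
            compute_plan_buffer.resize(plan.work_size);   // grows only when needed
            plan.work_data = compute_plan_buffer.data();  // assigns directly: work_data is now uint8_t *
        }
        ggml_graph_compute(&plan, gf);
        // no free(): the buffer outlives the plan and is reused on the next iteration
    }
}
```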

examples/baby-llama/baby-llama.cpp

Lines changed: 8 additions & 15 deletions
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;

+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1598,13 +1600,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);

examples/benchmark/benchmark-matmult.cpp

Lines changed: 11 additions & 18 deletions
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);

+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }

     TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@ int main(int argc, char ** argv) {
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf31);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@ int main(int argc, char ** argv) {

         // Running a different graph computation to make sure we override the CPU cache lines
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf32);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
     }
     printf("\n");

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 11 additions & 18 deletions
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }

+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);

     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) {
         }

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) {
         model.train_tokens += n_batch * n_tokens;

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
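The same block now recurs at every call site in this file (and in the other examples). A hypothetical helper, not part of this commit, that factors the pattern out could look like this; sharing one buffer across graphs of different sizes is safe because the buffer is resized for each graph's own plan:

```cpp
// Hypothetical refactor (illustration only): size the shared buffer for this
// particular graph's plan and run the graph; the buffer is owned by the caller.
static void graph_compute_reusing_buffer(std::vector<uint8_t> & buf, struct ggml_cgraph * graph, int n_threads) {
    auto plan = ggml_graph_compute_make_plan(graph, n_threads);
    if (plan.work_size > 0) {
        buf.resize(plan.work_size);     // sized per plan, so different graphs can share the buffer
        plan.work_data = buf.data();
    }
    ggml_graph_compute(&plan, graph);
}

// usage at a call site (illustrative):
//     graph_compute_reusing_buffer(compute_plan_buffer, gf, params.n_threads);
```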

ggml.c

Lines changed: 2 additions & 1 deletion
@@ -15974,7 +15974,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;

     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;

     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16490,6 +16490,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
         }
     }

+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {

ggml.h

Lines changed: 1 addition & 1 deletion
@@ -449,7 +449,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;

         int n_threads;

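Changing work_data from void * to uint8_t * does not affect the std::vector<uint8_t> callers above (a uint8_t * converts to void * implicitly), so the benefit is presumably inside ggml itself, where byte-offset arithmetic on the work buffer becomes standard-conforming without casts (arithmetic on void * is only a compiler extension). A hedged illustration; the per-thread slicing scheme shown here is an assumption, not necessarily how ggml partitions the buffer:

```cpp
#include <cstddef>
#include <cstdint>

// Illustration only: hand each worker thread its slice of the work buffer.
// With uint8_t * this is plain pointer arithmetic; with void * it would need a cast first.
static uint8_t * work_slice(uint8_t * work_data, size_t per_thread_size, int ith) {
    return work_data + (size_t) ith * per_thread_size;
}
```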

llama.cpp

Lines changed: 25 additions & 39 deletions
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;

+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1591,10 +1595,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);

+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1611,32 +1618,17 @@
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
     }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+#endif
+
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
-#endif

     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
@@ -2822,6 +2814,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2988,15 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         struct ggml_cgraph gf = ggml_build_forward(r);

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         // we won't need these tensors again, reset the context to save memory
@@ -3171,15 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         ggml_free(cpy_ctx);
@@ -3287,15 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));

         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }

         ggml_free(cpy_ctx);
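Because compute_plan_buffer lives in llama_context, it persists across calls to llama_eval_internal, llama_copy_state_data, and llama_set_state_data. std::vector::resize only reallocates when the requested size exceeds the current capacity, and capacity never shrinks, so once the largest graph has been evaluated the later calls stop allocating altogether. A small self-contained demonstration of that behavior (not llama.cpp code):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<uint8_t> compute_plan_buffer;

    // Simulated sequence of plan.work_size values from successive graph evaluations.
    const size_t work_sizes[] = { 16u << 10, 4u << 20, 64u << 10, 4u << 20 };

    for (size_t work_size : work_sizes) {
        const size_t cap_before = compute_plan_buffer.capacity();
        compute_plan_buffer.resize(work_size);              // reallocates only past the high-water mark
        uint8_t * work_data = compute_plan_buffer.data();   // what plan.work_data would point at
        (void) work_data;
        std::printf("requested %8zu bytes, capacity %8zu -> %8zu\n",
                    work_size, cap_before, compute_plan_buffer.capacity());
    }
    return 0;
}
```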
