Commit 5488fb7

slaren and ggerganov authored

ggml : allocate graphs in a context (#2392)

* ggml : graph allocation in contexts
* allocate work buffer as a ggml_object in ggml_graph_compute_with_ctx
* llama.cpp : allocate graph in the context
* add GGML_PAD

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent eb542d3 commit 5488fb7

File tree

3 files changed: +134 -88 lines changed

ggml.c

Lines changed: 100 additions & 71 deletions
@@ -4071,8 +4071,8 @@ bool ggml_is_numa(void) {
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
-            obj->offs, obj->size, (const void *) obj->next);
+    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+            obj->type, obj->offs, obj->size, (const void *) obj->next);
 }
 
 void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
 }
 
 size_t ggml_tensor_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
 bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4383,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
-    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
         /*.mem_size =*/ mem_size,
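
Note on the change above: ggml_init() previously rounded params.mem_size up to GGML_MEM_ALIGN even when the caller supplied the buffer, which could round past the end of a caller-provided allocation; now a user buffer is taken at its stated size and only ggml-owned pools are padded. A minimal sketch of the user-buffer path (buffer and size are illustrative, not from this commit):

    #include <stdint.h>
    #include "ggml.h"

    // the caller owns the buffer, so ggml_init() uses sizeof(buf) as-is
    static uint8_t buf[16*1024*1024]; // illustrative size

    struct ggml_context * make_ctx(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ sizeof(buf),
            /*.mem_buffer =*/ buf,
            /*.no_alloc   =*/ false,
        };
        return ggml_init(params);
    }
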
@@ -4472,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     while (obj != NULL) {
-        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
 
-        const size_t size = ggml_nbytes(tensor);
+            const size_t size = ggml_nbytes(tensor);
 
-        if (max_size < size) {
-            max_size = size;
+            if (max_size < size) {
+                max_size = size;
+            }
         }
 
         obj = obj->next;
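
Because graphs and work buffers now live in the same object list as tensors, any walker has to check obj->type before treating the payload as a tensor, as ggml_get_max_tensor_size does above. A sketch of the pattern as it would look inside ggml.c, where struct ggml_context is visible (the helper name is hypothetical):

    // hypothetical helper: count only tensor objects, skipping
    // GGML_OBJECT_GRAPH and GGML_OBJECT_WORK_BUFFER entries
    static int ggml_count_tensors(const struct ggml_context * ctx) {
        int n = 0;
        for (struct ggml_object * obj = ctx->objects_begin; obj != NULL; obj = obj->next) {
            if (obj->type == GGML_OBJECT_TENSOR) {
                n++;
            }
        }
        return n;
    }
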
@@ -4509,90 +4511,87 @@ static void ggml_scratch_load(struct ggml_context * ctx) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static struct ggml_tensor * ggml_new_tensor_impl(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t* ne,
-        void*  data) {
+static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
     // always insert objects at the end of the context's memory pool
     struct ggml_object * obj_cur = ctx->objects_end;
 
     const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
     const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
     const size_t cur_end  = cur_offs + cur_size;
 
-    size_t size_needed = 0;
-
-    if (data == NULL && !ctx->no_alloc) {
-        size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
-        for (int i = 1; i < n_dims; i++) {
-            size_needed *= ne[i];
-        }
-        // align to GGML_MEM_ALIGN
-        size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
-    }
+    // align to GGML_MEM_ALIGN
+    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
 
     char * const mem_buffer = ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
-    if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += GGML_TENSOR_SIZE;
+    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+                __func__, cur_end + size_needed, ctx->mem_size);
+        assert(false);
+        return NULL;
+    }
 
-        if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-            assert(false);
-            return NULL;
-        }
+    *obj_new = (struct ggml_object) {
+        .offs = cur_end + GGML_OBJECT_SIZE,
+        .size = size_needed,
+        .next = NULL,
+        .type = type,
+    };
 
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = size_needed,
-            .next = NULL,
-        };
+    ggml_assert_aligned(mem_buffer + obj_new->offs);
+
+    if (obj_cur != NULL) {
+        obj_cur->next = obj_new;
     } else {
-        if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
-            assert(false);
-            return NULL;
+        // this is the first object in this context
+        ctx->objects_begin = obj_new;
+    }
+
+    ctx->objects_end = obj_new;
+
+    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+    return obj_new;
+}
+
+static struct ggml_tensor * ggml_new_tensor_impl(
+        struct ggml_context * ctx,
+        enum   ggml_type type,
+        int    n_dims,
+        const int64_t* ne,
+        void*  data) {
+
+    size_t data_size = 0;
+
+    if (data == NULL && !ctx->no_alloc) {
+        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        for (int i = 1; i < n_dims; i++) {
+            data_size *= ne[i];
        }
+    }
 
-        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
+    if (ctx->scratch.data != NULL && data == NULL) {
+        // allocate tensor data in the scratch buffer
+        if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
         data = (char * const) ctx->scratch.data + ctx->scratch.offs;
 
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = GGML_TENSOR_SIZE,
-            .next = NULL,
-        };
-
-        //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
+        ctx->scratch.offs += data_size;
 
-        ctx->scratch.offs += size_needed;
+        data_size = 0;
     }
 
-    if (obj_cur != NULL) {
-        obj_cur->next = obj_new;
-    } else {
-        // this is the first object in this context
-        ctx->objects_begin = obj_new;
-    }
-
-    ctx->objects_end = obj_new;
-
-    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
 
-    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
+    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
 
-    ggml_assert_aligned(result);
+    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
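
For orientation, a sketch (not code from the commit) of the pool layout that ggml_new_object maintains; every allocation, tensor or not, is now a header followed by an aligned payload:

    // conceptual layout of ctx->mem_buffer after this refactor:
    //
    //   [ggml_object | payload][ggml_object | payload][ggml_object | payload]...
    //      TENSOR     tensor+data  GRAPH     cgraph  WORK_BUFFER  raw bytes
    //
    // obj->offs points just past the header, obj->size is the payload size
    // rounded up with GGML_PAD(size, GGML_MEM_ALIGN), and obj->next links
    // the objects in allocation order
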
@@ -5026,9 +5025,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     char * const mem_buffer = ctx->mem_buffer;
 
     while (obj != NULL) {
-        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
-        if (strcmp(cur->name, name) == 0) {
-            return cur;
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+            if (strcmp(cur->name, name) == 0) {
+                return cur;
+            }
         }
 
         obj = obj->next;
@@ -15829,6 +15830,35 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
     return result;
 }
 
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+    *cgraph = (struct ggml_cgraph) {
+        /*.n_nodes      =*/ 0,
+        /*.n_leafs      =*/ 0,
+        /*.nodes        =*/ { NULL },
+        /*.grads        =*/ { NULL },
+        /*.leafs        =*/ { NULL },
+        /*.hash_table   =*/ { NULL },
+        /*.perf_runs    =*/ 0,
+        /*.perf_cycles  =*/ 0,
+        /*.perf_time_us =*/ 0,
+    };
+
+    return cgraph;
+}
+
+struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
+    ggml_build_forward_impl(cgraph, tensor, false);
+    return cgraph;
+}
+
+size_t ggml_graph_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+}
+
 //
 // thread data
 //
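
Assuming a context with enough headroom, the new entry points make graph construction a context allocation rather than a large stack object; the tensors here are illustrative:

    // sketch: build a forward graph inside the context
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    // equivalent to ggml_new_graph(ctx) followed by building the forward pass
    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c);
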
@@ -16544,10 +16574,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
 
-    struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
-    GGML_ASSERT(buf);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
 
-    cplan.work_data = buf->data;
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
     ggml_graph_compute(cgraph, &cplan);
 }
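
Continuing the sketch above, a full compute now needs no allocation outside the context, since the work buffer is carved from the same pool:

    // sketch: the work buffer becomes a GGML_OBJECT_WORK_BUFFER object in ctx,
    // so ctx must have cplan.work_size bytes of headroom left
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    // caveat: the pool is bump-allocated, so each call leaves its work-buffer
    // object in place until ggml_free(ctx) releases the whole context
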

ggml.h

Lines changed: 19 additions & 2 deletions
@@ -208,6 +208,7 @@
 
 #define GGML_UNUSED(x) (void)(x)
 
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
 #define GGML_ASSERT(x) \
     do { \
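
GGML_PAD(x, n) rounds x up to the next multiple of n, assuming n is a power of two (the bit-mask trick does not work otherwise). A quick compile-time sanity check of the expected values:

    // C11 checks of the rounding behavior; n must be a power of two
    _Static_assert(GGML_PAD(13, 16) == 16, "13 rounds up to 16");
    _Static_assert(GGML_PAD(32, 16) == 32, "exact multiples are unchanged");
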
@@ -396,14 +397,22 @@ extern "C" {
         GGML_UNARY_OP_SILU,
     };
 
+    enum ggml_object_type {
+        GGML_OBJECT_TENSOR,
+        GGML_OBJECT_GRAPH,
+        GGML_OBJECT_WORK_BUFFER
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
         size_t size;
 
         struct ggml_object * next;
 
-        char padding[8];
+        enum ggml_object_type type;
+
+        char padding[4];
     };
 
     static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
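
The new type field takes four of the former eight padding bytes, so on a typical 64-bit ABI with 4-byte enums the header stays 32 bytes and GGML_OBJECT_SIZE keeps its old value. A hedged check one could add under that assumption:

    // assumption: 4-byte enum; offs(8) + size(8) + next(8) + type(4) + padding(4) = 32
    _Static_assert(sizeof(struct ggml_object) == 32, "header size unchanged by the type field");
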
@@ -424,7 +433,7 @@ extern "C" {
         enum ggml_op op;
 
         // op params - allocated as int32_t for alignment
-        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
         bool is_param;
 
@@ -485,6 +494,8 @@ extern "C" {
         int64_t perf_time_us;
     };
 
+    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -1391,11 +1402,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * tensor);
 
+
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_graph_overhead(void);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
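
Together with ggml_tensor_overhead(), the new ggml_graph_overhead() lets callers budget a context up front instead of guessing. A rough sizing sketch (the counts and slack are illustrative assumptions):

    // sketch: reserve room for two small tensors, their data, and one graph
    size_t mem_size = 2*ggml_tensor_overhead()   // tensor headers
                    + 2*8*sizeof(float)          // illustrative tensor data
                    + ggml_graph_overhead()      // graph object
                    + 64;                        // slack for alignment padding (assumption)

    struct ggml_init_params params = {
        /*.mem_size   =*/ mem_size,
        /*.mem_buffer =*/ NULL,   // let ggml allocate; the size gets GGML_PAD-ed
        /*.no_alloc   =*/ false,
    };
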
