
Commit bea440d

llama : refactor model loader with backend registry

1 parent 19d900a

15 files changed: +1512 −1707 lines changed

ggml/include/ggml-backend.h

Lines changed: 2 additions & 2 deletions

@@ -169,8 +169,8 @@ extern "C" {
 
     // Functions that may be obtained using ggml_backend_reg_get_proc_address
-    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
-    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
 
    //
    // Backend registry

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
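Note: callers that link the CUDA backend directly now pass the main device index alongside the per-device split fractions, and generic code is expected to obtain the same entry point through the backend registry. A minimal sketch, assuming two CUDA devices with illustrative split values; the registry lookup via ggml_backend_reg_by_name and the proc name string "ggml_backend_split_buffer_type" are assumptions for illustration, not confirmed by this diff:

    // hedged sketch: direct call with the new main_device parameter (values illustrative)
    const float tensor_split[2] = {0.7f, 0.3f};
    ggml_backend_buffer_type_t split_buft = ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);

    // hedged sketch: the same buffer type obtained through the registry, without linking CUDA directly
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA");
    if (reg != NULL) {
        ggml_backend_split_buffer_type_t split_fn =
            (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (split_fn != NULL) {
            split_buft = split_fn(/*main_device=*/0, tensor_split);
        }
    }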

ggml/src/ggml-amx.cpp

Lines changed: 0 additions & 17 deletions

@@ -16,12 +16,6 @@
 #if defined(__AMX_INT8__)
 
 // AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "AMX";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 }
 
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .get_name = */ ggml_backend_amx_buffer_get_name,
     /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
     /* .get_base = */ ggml_backend_amx_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
 
@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 static struct ggml_backend_i ggml_backend_amx_i = {
     /* .get_name = */ ggml_backend_amx_name,
     /* .free = */ ggml_backend_amx_free,
-    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_amx_graph_compute,
-    /* .supports_op = */ NULL,
-    /* .supports_buft = */ NULL,
-    /* .offload_op = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
 };

ggml/src/ggml-backend-impl.h

Lines changed: 4 additions & 15 deletions

@@ -22,7 +22,7 @@ extern "C" {
         size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
         size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool (*is_host) (ggml_backend_buffer_type_t buft);
     };
 
@@ -37,7 +37,6 @@
     //
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name) (ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void (*free_buffer) (ggml_backend_buffer_t buffer);
         // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
 
-        // (optional) compute graph with a plan (not used currently)
+        // (optional) graph plans
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@
         // compute graph (always async if supported by the backend)
         enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        // new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-        bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-
        // (optional) event synchronization
        // record an event on this stream
        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);

ggml/src/ggml-backend.cpp

Lines changed: 32 additions & 70 deletions
@@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->iface.get_base == NULL && buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
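Note: with these three changes, a zero-sized allocation never reaches the backend: the wrapper returns a dummy buffer with an empty interface, get_base on it yields NULL, and the buffer name is derived from the buffer type rather than a per-buffer get_name callback. A minimal sketch of the behavior a caller can rely on (the CPU buffer type is used only for illustration):

    // hedged sketch: a zero-sized allocation returns a dummy buffer; freeing it is still safe
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
    GGML_ASSERT(ggml_backend_buffer_get_base(buf) == NULL);      // no backing allocation
    const char * name = ggml_backend_buffer_name(buf);           // same string as ggml_backend_buft_name(buft)
    ggml_backend_buffer_free(buf);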
@@ -198,7 +208,7 @@
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -238,43 +248,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
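Note: the size == 0 early-out now runs before the buffer and allocation assertions, so zero-length transfers are no-ops even on tensors that have no buffer or data yet. A hedged sketch, assuming `tensor` is any struct ggml_tensor * (allocated or not):

    ggml_backend_tensor_set(tensor, /*data=*/NULL, /*offset=*/0, /*size=*/0);   // returns immediately, no asserts fire
    ggml_backend_tensor_get(tensor, /*data=*/NULL, /*offset=*/0, /*size=*/0);   // likewise a no-op
    ggml_backend_tensor_memset(tensor, /*value=*/0, /*offset=*/0, /*size=*/0);  // likewise a no-op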
@@ -316,32 +325,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
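Note: every backend is now expected to carry a device, so the compatibility helpers forward unconditionally to the device interface instead of falling back to per-backend callbacks. A hedged sketch, assuming `backend` and `op` are valid:

    // the backend-level query and the device-level query are expected to agree
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    bool ok_backend = ggml_backend_supports_op(backend, op);
    bool ok_device  = ggml_backend_dev_supports_op(dev, op);
    GGML_ASSERT(ok_backend == ok_device);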
@@ -713,12 +705,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "CPU";
-
-    GGML_UNUSED(buffer);
-}
-
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     uintptr_t data = (uintptr_t)buffer->context;
 
@@ -767,7 +753,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .get_name = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -780,7 +765,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .get_name = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -799,19 +783,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    auto alloc_size = size;
-    if (alloc_size == 0) {
-        alloc_size = 1;
-    }
-
-    void * data = ggml_aligned_malloc(alloc_size);
+    void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
 
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
 
 static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -923,12 +902,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 struct ggml_backend_plan_cpu {
     struct ggml_cplan cplan;
     struct ggml_cgraph cgraph;
@@ -998,7 +971,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .get_name = */ ggml_backend_cpu_get_name,
     /* .free = */ ggml_backend_cpu_free,
-    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
@@ -1008,9 +980,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
-    /* .supports_op = */ NULL,
-    /* .supports_buft = */ NULL,
-    /* .offload_op = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
 };
@@ -1315,12 +1284,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1339,7 +1302,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .get_name = */ ggml_backend_multi_buffer_get_name,
     /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
     /* .get_base = */ NULL,
     /* .init_tensor = */ NULL,
@@ -1368,7 +1330,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }
 
 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
