
Commit 369213e

llama : offload to RPC in addition to other backends
1 parent 549279d

File tree

2 files changed: +53 −38 lines

ggml-backend.c

Lines changed: 3 additions & 1 deletion

@@ -321,7 +321,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
     } else if (ggml_backend_buffer_is_host(dst->buffer)) {
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+    }
+    bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)) == 0;
+    if (!same_backend || !ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
         fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
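
For readers skimming the hunk: with this change the direct backend-to-backend copy is only trusted when both buffers report the same backend name; any copy between buffers of different backends now takes the slow path behind the warning. A standalone sketch of that gate, factored into a helper for clarity (illustrative only, not part of the commit; the header is assumed to declare the functions used in the hunk):

#include <string.h>

#include "ggml-backend.h"   // assumed to declare ggml_backend_buffer_name / ggml_backend_buffer_copy_tensor

// Illustrative helper (not part of the commit): returns true when the direct copy
// path was taken; false means the caller falls back to the slow host round-trip
// that the "slow copy" warning above refers to.
static bool try_direct_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    // only attempt the direct copy when both buffers belong to the same backend
    bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer),
                               ggml_backend_buffer_name(dst->buffer)) == 0;
    return same_backend && ggml_backend_buffer_copy_tensor(src, dst);
}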

llama.cpp

Lines changed: 50 additions & 37 deletions
@@ -2371,13 +2371,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
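
These two hunks establish the new device numbering: indices 0 .. dev_count - rpc_count - 1 are the local GPU devices, and the last rpc_count indices map onto model.rpc_servers in order. A standalone sketch of the index arithmetic with hypothetical counts (two local GPUs and two RPC servers; illustrative only):

#include <cstdio>

// Illustrative only: mirrors the arithmetic used in llama_default_buffer_type_offload
// and llama_get_device_memory, with hypothetical counts (2 local GPUs + 2 RPC servers).
int main() {
    const int rpc_count = 2;               // model.rpc_servers.size()
    const int dev_count = 2 + rpc_count;   // llama_get_device_count(): local GPUs + RPC servers
    for (int device = 0; device < dev_count; device++) {
        if (device >= dev_count - rpc_count) {
            // tail of the range: RPC server
            printf("device %d -> rpc_servers[%d]\n", device, device - dev_count + rpc_count);
        } else {
            // head of the range: local GPU
            printf("device %d -> local GPU %d\n", device, device);
        }
    }
    return 0;
}
// prints: devices 0 and 1 -> local GPUs, device 2 -> rpc_servers[0], device 3 -> rpc_servers[1]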
@@ -16162,7 +16173,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
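
For context on where params.rpc_servers comes from: it is the comma-separated endpoint list on llama_model_params, and with the tightened check above an empty string is now ignored instead of being parsed as a server list. A minimal sketch of setting it through the public C API (model path and endpoints are placeholders, not from the commit):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                                         // offload as many layers as possible
    mparams.rpc_servers  = "192.168.88.10:50052,192.168.88.11:50052";  // hypothetical endpoints

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
    // ... run inference; layers can now be offloaded to local GPUs and RPC servers together ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}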
@@ -16325,17 +16336,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16427,6 +16428,18 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
+            const char * endpoint = model->rpc_servers[i].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
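
Note the ordering change across the last two hunks: the RPC backends are no longer created first (and exclusively), but are appended after the local GPU backends and before the CPU backend, matching the device numbering in which RPC devices occupy the last indices. A toy sketch of the resulting ctx->backends order (labels and endpoints are illustrative, nothing is actually initialized here):

#include <cstdio>
#include <string>
#include <vector>

// Illustrative only: the order in which backends are collected after this change,
// for a hypothetical build with one local GPU and two configured RPC servers.
int main() {
    std::vector<std::string> backends;
    backends.push_back("local GPU 0");               // local GPU backends first
    backends.push_back("RPC 192.168.88.10:50052");   // then one backend per RPC server
    backends.push_back("RPC 192.168.88.11:50052");
    backends.push_back("CPU");                       // CPU backend is added last
    for (size_t i = 0; i < backends.size(); i++) {
        printf("backends[%zu] = %s\n", i, backends[i].c_str());
    }
    return 0;
}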
