Commit a0e5fa4

llama : offload to RPC in addition to other backends
1 parent 7846540 commit a0e5fa4

3 files changed: +87 -44 lines

ggml-rpc.cpp

Lines changed: 27 additions & 4 deletions
@@ -106,6 +106,7 @@ struct ggml_backend_rpc_buffer_type_context {
 };
 
 struct ggml_backend_rpc_context {
+    int device;
     std::string endpoint;
     std::string name;
 };
@@ -117,6 +118,9 @@ struct ggml_backend_rpc_buffer_context {
     std::string name;
 };
 
+// device -> endpoint mapping
+static std::unordered_map<int, std::string> endpoints;
+
 // RPC helper functions
 
 static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
@@ -573,7 +577,7 @@ GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
 
 GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
-    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+    return ggml_backend_rpc_buffer_type(ctx->device);
 }
 
 GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
@@ -659,9 +663,13 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .event_synchronize = */ NULL,
 };
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(int device) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
+    if (endpoints.find(device) == endpoints.end()) {
+        return nullptr;
+    }
+    auto endpoint = endpoints[device];
     // NOTE: buffer types are allocated and never freed; this is by design
     static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
     auto it = buft_map.find(endpoint);
@@ -689,8 +697,17 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
     return buft;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
+GGML_API GGML_CALL void ggml_backend_rpc_setdevice(const char * endpoint, int device) {
+    endpoints[device] = endpoint;
+}
+
+GGML_CALL ggml_backend_t ggml_backend_rpc_init(int device) {
+    if (endpoints.find(device) == endpoints.end()) {
+        return nullptr;
+    }
+    auto endpoint = endpoints[device];
     ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
+        /* .device   = */ device,
         /* .endpoint = */ endpoint,
         /* .name     = */ "RPC",
     };
@@ -723,7 +740,13 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
     *total = total_mem;
 }
 
-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(int device, size_t * free, size_t * total) {
+    if (endpoints.find(device) == endpoints.end()) {
+        *free = 0;
+        *total = 0;
+        return;
+    }
+    auto endpoint = endpoints[device];
     auto sock = get_socket(endpoint);
    if (sock == nullptr) {
        *free = 0;
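
The net effect in ggml-rpc.cpp is that every public entry point now resolves an integer device id through the static device -> endpoint map before touching the network, and fails gracefully when the id is unknown. A minimal sketch of that behaviour, assuming the functions above; the endpoint address and device id are placeholders chosen only for illustration:

#include "ggml-rpc.h"
#include <cstdio>

int main() {
    size_t free_mem = 0, total_mem = 0;

    // Nothing is registered for device 7 yet, so the lookups fail gracefully:
    // init returns nullptr and the memory query reports 0/0.
    ggml_backend_t backend = ggml_backend_rpc_init(7);
    ggml_backend_rpc_get_device_memory(7, &free_mem, &total_mem);
    printf("unregistered: backend=%p free=%zu total=%zu\n",
           (void *) backend, free_mem, total_mem);

    // After registration, device 7 maps to this (placeholder) endpoint and
    // later calls will connect to it on demand.
    ggml_backend_rpc_setdevice("192.168.88.10:50052", 7);
    return 0;
}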

ggml-rpc.h

Lines changed: 5 additions & 3 deletions
@@ -10,12 +10,14 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS 16
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL void ggml_backend_rpc_setdevice(const char * endpoint, int device);
+
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(int device);
 GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(int device);
 
-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(int device, size_t * free, size_t * total);
 
 GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
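
Taken together, the updated header implies a two-step flow for callers: bind an endpoint to a device id with ggml_backend_rpc_setdevice, then address the backend purely by that id. A hedged usage sketch against this header; the endpoint is a placeholder and an RPC server is assumed to be listening there:

#include "ggml-rpc.h"
#include <cstdio>

int main() {
    const char * endpoint = "127.0.0.1:50052"; // placeholder address
    const int    device   = 0;

    ggml_backend_rpc_setdevice(endpoint, device);

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(device, &free_mem, &total_mem);
    printf("RPC device %d: %zu free / %zu total bytes\n", device, free_mem, total_mem);

    ggml_backend_t backend = ggml_backend_rpc_init(device);
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize RPC device %d\n", device);
        return 1;
    }
    ggml_backend_free(backend);
    return 0;
}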

llama.cpp

Lines changed: 55 additions & 37 deletions
@@ -2369,13 +2369,39 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    int rpc_count = (int)model.rpc_servers.size();
+    for (int i = 0; i < rpc_count; i++) {
+        int device = count + i;
+        const char * endpoint = model.rpc_servers[i].c_str();
+        ggml_backend_rpc_setdevice(endpoint, device);
+    }
+    count += rpc_count;
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        return ggml_backend_rpc_buffer_type(gpu);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
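
The relocated llama_get_device_count fixes the device numbering the rest of the patch relies on: local GPUs occupy indices [0, dev_count - rpc_count) and the RPC servers are appended at the end, which is why llama_default_buffer_type_offload can test gpu >= dev_count - rpc_count. A standalone illustration of that partition, with made-up device counts:

#include <cstdio>

// Mirrors the index check used in the patch: the last rpc_count device
// indices belong to RPC servers, everything before them is local.
static bool is_rpc_device(int device, int dev_count, int rpc_count) {
    return device >= dev_count - rpc_count;
}

int main() {
    const int dev_count = 3; // e.g. 1 local GPU + 2 RPC servers (illustrative)
    const int rpc_count = 2;
    for (int d = 0; d < dev_count; d++) {
        printf("device %d -> %s\n", d,
               is_rpc_device(d, dev_count, rpc_count) ? "RPC" : "local");
    }
    return 0;
}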
@@ -2423,29 +2449,18 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        ggml_backend_rpc_get_device_memory(device, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16146,7 +16161,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
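
The hunk above only widens the guard; the splitting itself turns the comma-separated params.rpc_servers string into individual endpoints stored in model->rpc_servers. A self-contained sketch of that kind of split, with placeholder endpoints (the comma delimiter is an assumption based on the server-list format, not copied from the file):

#include <cstdio>
#include <string>
#include <vector>

// Split "host1:port,host2:port" into its individual endpoints.
static std::vector<std::string> split_servers(std::string servers) {
    std::vector<std::string> out;
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    if (!servers.empty()) {
        out.push_back(servers);
    }
    return out;
}

int main() {
    for (const auto & s : split_servers("192.168.88.10:50052,192.168.88.11:50052")) {
        printf("endpoint: %s\n", s.c_str());
    }
    return 0;
}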
@@ -16304,17 +16319,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16406,6 +16411,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        int dev_count = (int)llama_get_device_count(*model);
+        int rpc_count = (int)model->rpc_servers.size();
+        for (int i = dev_count - rpc_count; i < dev_count; i++) {
+            ggml_backend_t backend = ggml_backend_rpc_init(i);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC #%d\n", __func__, i);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
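
From the application side the change is transparent: RPC endpoints are passed through llama_model_params.rpc_servers, and the resulting devices are initialized after the local GPU backends as shown above. A hedged end-to-end sketch against the public llama.h API of this revision; the model path, endpoints, and layer count are placeholders:

#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                                         // offload as many layers as possible
    mparams.rpc_servers  = "192.168.88.10:50052,192.168.88.11:50052";  // appended after local GPUs

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}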
