@@ -2371,13 +2371,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
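With this change, device indices are ordered so that local GPUs come first and RPC servers occupy the trailing indices; the `gpu >= dev_count - rpc_count` test and the `gpu - dev_count + rpc_count` subscript both follow from that convention. A minimal standalone sketch of the mapping (the helper name and framing are editorial, not part of the patch):

```cpp
// Sketch of the index convention assumed above: devices
// [0, dev_count - rpc_count) are local GPUs, the rest are RPC servers.
// Returns the index into model.rpc_servers, or -1 for a local device.
static int rpc_server_index(int device, int dev_count, int rpc_count) {
    if (device < dev_count - rpc_count) {
        return -1;                           // local GPU, no RPC server
    }
    return device - (dev_count - rpc_count); // same as device - dev_count + rpc_count
}
```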
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
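As a concrete check of the same arithmetic, consider a hypothetical machine with two local CUDA GPUs and one RPC server: `llama_get_device_count()` reports 3 devices, and only the last index takes the RPC branch above:

```cpp
// Hypothetical example: dev_count == 3 (2 CUDA GPUs + 1 RPC server), rpc_count == 1.
// device 0, 1 -> local CUDA devices (fall through to the CUDA branch)
// device 2    -> 2 >= 3 - 1, so it maps to rpc_servers[2 - 3 + 1] == rpc_servers[0]
static_assert(2 - 3 + 1 == 0, "last device maps to the first RPC server");
```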
@@ -16162,7 +16173,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
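The comment `// split the servers set them into model->rpc_servers` refers to a comma-separated endpoint list (as passed via `--rpc`); the loop body itself lies outside this hunk, so the following self-contained helper is only an assumption about its shape:

```cpp
// Sketch (assumed, not copied from the patch) of splitting a
// comma-separated endpoint list such as "host1:port,host2:port".
#include <string>
#include <vector>

static std::vector<std::string> split_servers(const std::string & servers) {
    std::vector<std::string> out;
    size_t pos  = 0;
    size_t prev = 0;
    while ((pos = servers.find(',', prev)) != std::string::npos) {
        out.push_back(servers.substr(prev, pos - prev));
        prev = pos + 1;
    }
    out.push_back(servers.substr(prev)); // endpoint after the last comma
    return out;
}
```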
@@ -16325,17 +16336,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16427,6 +16428,18 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
+            const char * endpoint = model->rpc_servers[i].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
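Note on ordering (editorial, not part of the diff): moving the RPC loop here, after the Metal/CUDA/Vulkan/SYCL branches, makes the order of `ctx->backends` match the device numbering used by `llama_get_device_count()` and `llama_default_buffer_type_offload()` above, where RPC devices occupy the highest indices. It also turns RPC from an exclusive `#if`/`#elif` choice into an additive `#endif` + `#if` pair, so local GPUs and RPC servers can be used together in one context.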