@@ -2369,13 +2369,39 @@ struct llama_context {
2369
2369
struct llama_control_vector cvec;
2370
2370
};
2371
2371
2372
+ static size_t llama_get_device_count(const llama_model & model) {
2373
+ size_t count = 1;
2374
+ #if defined(GGML_USE_CUDA)
2375
+ count = ggml_backend_cuda_get_device_count();
2376
+ #elif defined(GGML_USE_SYCL)
2377
+ count = ggml_backend_sycl_get_device_count();
2378
+ #elif defined(GGML_USE_VULKAN)
2379
+ count = ggml_backend_vk_get_device_count();
2380
+ #endif
2381
+ #if defined(GGML_USE_RPC)
2382
+ int rpc_count = (int)model.rpc_servers.size();
2383
+ for (int i = 0; i < rpc_count; i++) {
2384
+ int device = count + i;
2385
+ const char * endpoint = model.rpc_servers[i].c_str();
2386
+ ggml_backend_rpc_setdevice(endpoint, device);
2387
+ }
2388
+ count += rpc_count;
2389
+ #endif
2390
+ return count;
2391
+ GGML_UNUSED(model);
2392
+ }
2393
+
2372
2394
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
2373
2395
ggml_backend_buffer_type_t buft = nullptr;
2374
2396
2375
- #ifdef GGML_USE_RPC
2376
- std::string endpoint = model.rpc_servers[gpu];
2377
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2378
- #elif defined(GGML_USE_METAL)
2397
+ #if defined(GGML_USE_RPC)
2398
+ int dev_count = (int)llama_get_device_count(model);
2399
+ int rpc_count = (int)model.rpc_servers.size();
2400
+ if (gpu >= dev_count - rpc_count) {
2401
+ return ggml_backend_rpc_buffer_type(gpu);
2402
+ }
2403
+ #endif
2404
+ #if defined(GGML_USE_METAL)
2379
2405
buft = ggml_backend_metal_buffer_type();
2380
2406
#elif defined(GGML_USE_CUDA)
2381
2407
buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2423,29 +2449,18 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
2423
2449
GGML_UNUSED(tensor_split);
2424
2450
}
2425
2451
2426
- static size_t llama_get_device_count(const llama_model & model) {
2427
- #if defined(GGML_USE_RPC)
2428
- return model.rpc_servers.size();
2429
- #elif defined(GGML_USE_CUDA)
2430
- return ggml_backend_cuda_get_device_count();
2431
- #elif defined(GGML_USE_SYCL)
2432
- return ggml_backend_sycl_get_device_count();
2433
- #elif defined(GGML_USE_VULKAN)
2434
- return ggml_backend_vk_get_device_count();
2435
- #else
2436
- return 1;
2437
- #endif
2438
- GGML_UNUSED(model);
2439
- }
2440
-
2441
2452
static size_t llama_get_device_memory(const llama_model & model, int device) {
2442
2453
#if defined(GGML_USE_RPC)
2443
- size_t total;
2444
- size_t free;
2445
- std::string endpoint = model.rpc_servers[device];
2446
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2447
- return free;
2448
- #elif defined(GGML_USE_CUDA)
2454
+ int dev_count = (int)llama_get_device_count(model);
2455
+ int rpc_count = (int)model.rpc_servers.size();
2456
+ if (device >= dev_count - rpc_count) {
2457
+ size_t total;
2458
+ size_t free;
2459
+ ggml_backend_rpc_get_device_memory(device, &free, &total);
2460
+ return free;
2461
+ }
2462
+ #endif
2463
+ #if defined(GGML_USE_CUDA)
2449
2464
size_t total;
2450
2465
size_t free;
2451
2466
ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16146,7 +16161,7 @@ struct llama_model * llama_load_model_from_file(
16146
16161
return true;
16147
16162
};
16148
16163
}
16149
- if (params.rpc_servers != nullptr) {
16164
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0' ) {
16150
16165
// split the servers set them into model->rpc_servers
16151
16166
std::string servers(params.rpc_servers);
16152
16167
size_t pos = 0;
@@ -16304,17 +16319,7 @@ struct llama_context * llama_new_context_with_model(
16304
16319
16305
16320
if (!hparams.vocab_only) {
16306
16321
// initialize backends
16307
- #if defined(GGML_USE_RPC)
16308
- for (auto & server : model->rpc_servers) {
16309
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
16310
- if (backend == nullptr) {
16311
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
16312
- llama_free(ctx);
16313
- return nullptr;
16314
- }
16315
- ctx->backends.push_back(backend);
16316
- }
16317
- #elif defined(GGML_USE_METAL)
16322
+ #if defined(GGML_USE_METAL)
16318
16323
if (model->n_gpu_layers > 0) {
16319
16324
ctx->backend_metal = ggml_backend_metal_init();
16320
16325
if (ctx->backend_metal == nullptr) {
@@ -16406,6 +16411,19 @@ struct llama_context * llama_new_context_with_model(
16406
16411
}
16407
16412
ctx->backends.push_back(backend);
16408
16413
}
16414
+ #endif
16415
+ #if defined(GGML_USE_RPC)
16416
+ int dev_count = (int)llama_get_device_count(*model);
16417
+ int rpc_count = (int)model->rpc_servers.size();
16418
+ for (int i = dev_count - rpc_count; i < dev_count; i++) {
16419
+ ggml_backend_t backend = ggml_backend_rpc_init(i);
16420
+ if (backend == nullptr) {
16421
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC #%d\n", __func__, i);
16422
+ llama_free(ctx);
16423
+ return nullptr;
16424
+ }
16425
+ ctx->backends.push_back(backend);
16426
+ }
16409
16427
#endif
16410
16428
ctx->backend_cpu = ggml_backend_cpu_init();
16411
16429
if (ctx->backend_cpu == nullptr) {
0 commit comments