@@ -3418,22 +3418,25 @@ struct llama_lora_adapter {
     }
 };
 
-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
 
-    count = model.devices.size();
+#if defined(GGML_USE_RPC)
+    count += (int) model.rpc_servers.size();
+#endif
 
-#if defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
+    count += ggml_backend_vk_get_device_count();
 #elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
-#if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += ggml_backend_cann_get_device_count();
 #endif
+
     return count;
+
     GGML_UNUSED(model);
 }
 
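Not from the commit itself: a minimal sketch, assuming only what the diff shows, of the flat device-index convention that llama_get_device_count() now sums up. A flat index is resolved by subtracting each group's size in the order a given call site uses (the buffer-type helper in the next hunk strips RPC endpoints first; the context initializer later strips registry devices first). resolve_flat_device and the group-size vector are hypothetical names, not llama.cpp API.

// Hypothetical illustration of the flat-index convention:
// walk the groups in call-site order, peeling off each group's size.
#include <utility>
#include <vector>

static std::pair<size_t, int> resolve_flat_device(int idx, const std::vector<int> & group_sizes) {
    for (size_t g = 0; g < group_sizes.size(); ++g) {
        if (idx < group_sizes[g]) {
            return { g, idx };     // idx is now local to group g
        }
        idx -= group_sizes[g];     // same pattern as "device -= rpc_count;" in the hunks below
    }
    return { group_sizes.size(), idx };  // out of range; callers are expected to validate first
}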
@@ -3482,12 +3485,13 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         const char * endpoint = model.rpc_servers[device].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
-    device = device - rpc_count;
+    device -= rpc_count;
 #endif
 
     if (device < (int)model.devices.size()) {
-        buft = ggml_backend_dev_buffer_type(model.devices[device]);
+        return ggml_backend_dev_buffer_type(model.devices[device]);
     }
+    device -= (int)model.devices.size();
 
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
@@ -6965,6 +6969,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
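The check above turns an out-of-range main_gpu into a hard error whenever a single main device is used. A caller-side sketch, assuming the public llama.h API of this version; the clamping policy and helper name are illustrative only:

#include "llama.h"

// Hypothetical helper: clamp main_gpu before loading so the runtime_error
// added in llm_load_tensors is never triggered. llama_max_devices() is the
// compile-time upper bound, which may exceed the devices actually present.
static llama_model * load_with_main_gpu(const char * path, int32_t main_gpu) {
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    if (main_gpu < 0 || (size_t) main_gpu >= llama_max_devices()) {
        main_gpu = 0;  // fall back to the first device
    }
    mparams.main_gpu = main_gpu;
    return llama_load_model_from_file(path, mparams);
}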
@@ -19291,30 +19302,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
-        }
-#endif
-
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used
-            ggml_backend_dev_t main_dev = model->devices[model->main_gpu];
-            ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
            for (auto * dev : model->devices) {
@@ -19327,6 +19328,26 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
+#endif
 
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
@@ -19345,7 +19366,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19366,9 +19387,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19387,7 +19408,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19396,29 +19417,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-        for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-            ggml_backend_t backend = ggml_backend_cann_init(device);
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-    }
 #endif
 
 #ifdef GGML_USE_BLAS
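For context, an end-to-end sketch of the path exercised by the context-creation changes above (explicit main_gpu with LLAMA_SPLIT_MODE_NONE), using only public llama.h API calls; the model path and parameter values are placeholders, not taken from the commit:

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE;  // only the main GPU backend is used
    mparams.main_gpu     = 0;                      // index into the flat device list
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);  // runs the backend init shown above
    if (ctx == nullptr) {
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}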