
Commit dc475c3

fix consistency issues with the usage of main_gpu
1 parent b5516aa commit dc475c3

File tree

1 file changed: +72 -51 lines changed

src/llama.cpp

Lines changed: 72 additions & 51 deletions
@@ -3418,22 +3418,25 @@ struct llama_lora_adapter {
     }
 };
 
-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
 
-    count = model.devices.size();
+#if defined(GGML_USE_RPC)
+    count += (int) model.rpc_servers.size();
+#endif
 
-#if defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
+    count += ggml_backend_vk_get_device_count();
 #elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
-#if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += ggml_backend_cann_get_device_count();
 #endif
+
     return count;
+
     GGML_UNUSED(model);
 }
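
Note: after this change the device count is the sum of several groups: the registry devices, the RPC servers in GGML_USE_RPC builds, and one compile-time backend block (Metal/SYCL/Vulkan/CANN). The call sites touched below walk a flat index through those groups by subtracting the size of each group already handled. A minimal standalone C++ sketch of that counting-and-rebasing pattern, with hypothetical group sizes and a hypothetical group order (the actual order differs per call site in llama.cpp):

#include <cstdio>

// hypothetical group sizes; in llama.cpp they come from model.devices.size(),
// model.rpc_servers.size() and the compiled-in backend's device count
static const int n_registry_devices = 2;
static const int n_rpc_servers      = 1;
static const int n_backend_devices  = 1;

// total device count, in the spirit of the new llama_get_device_count(): a sum over all groups
static int device_count() {
    return n_registry_devices + n_rpc_servers + n_backend_devices;
}

// map a flat index to (group, local index) by repeatedly subtracting group sizes,
// mirroring the "if (idx >= group_size) idx -= group_size;" pattern in this commit
static void resolve(int idx) {
    if (idx < n_registry_devices) { printf("registry device %d\n", idx); return; }
    idx -= n_registry_devices;
    if (idx < n_rpc_servers) { printf("rpc server %d\n", idx); return; }
    idx -= n_rpc_servers;
    printf("backend device %d\n", idx);
}

int main() {
    for (int i = 0; i < device_count(); ++i) {
        resolve(i); // prints: registry 0, registry 1, rpc 0, backend 0
    }
    return 0;
}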

@@ -3482,12 +3485,13 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         const char * endpoint = model.rpc_servers[device].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
-    device = device - rpc_count;
+    device -= rpc_count;
 #endif
 
     if (device < (int)model.devices.size()) {
-        buft = ggml_backend_dev_buffer_type(model.devices[device]);
+        return ggml_backend_dev_buffer_type(model.devices[device]);
     }
+    device -= (int)model.devices.size();
 
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
@@ -6965,6 +6969,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
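
For context, the new check makes an out-of-range main_gpu fail at model load time instead of indexing past the end of the device list. A usage sketch against the public llama.h API of this period (the model path and parameter values are placeholders; the error text comes from the format() call above):

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // main_gpu is validated for NONE/ROW
    mparams.main_gpu     = 42;                    // assumed out of range on this machine

    // the runtime_error thrown in llm_load_tensors is caught inside the library
    // and surfaces as a failed load (nullptr) plus an error log
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "model load failed (e.g. invalid main_gpu)\n");
    } else {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}
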
@@ -19291,30 +19302,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
-        }
-#endif
-
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used
-            ggml_backend_dev_t main_dev = model->devices[model->main_gpu];
-            ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (auto * dev : model->devices) {
@@ -19327,6 +19328,26 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
+#endif
 
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
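
Worked example of the rebasing added in this hunk: with, say, one registry device and one RPC server compiled in, a user-supplied main_gpu of 2 is reduced to 0 before it reaches the backend-specific init calls (Vulkan/SYCL/Kompute/CANN) further down. A small standalone sketch with those assumed sizes:

#include <cassert>

int main() {
    int main_gpu = 2;                 // model->main_gpu as supplied by the user
    const int n_registry_devices = 1; // assumed (int) model->devices.size()
    const int n_rpc_servers      = 1; // assumed (int) model->rpc_servers.size()

    // same two steps as the hunk above
    if (main_gpu >= n_registry_devices) { main_gpu -= n_registry_devices; } // 2 -> 1
    if (main_gpu >= n_rpc_servers)      { main_gpu -= n_rpc_servers;      } // 1 -> 0

    // this is the index the per-backend init calls below receive
    assert(main_gpu == 0);
    return 0;
}
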
@@ -19345,7 +19366,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19366,9 +19387,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19387,7 +19408,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19396,29 +19417,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-        for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-            ggml_backend_t backend = ggml_backend_cann_init(device);
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-    }
 #endif
 
 #ifdef GGML_USE_BLAS
