
Commit db53f8e

fix pipeline parallelism check
1 parent 04ef648 commit db53f8e

File tree

1 file changed: +24 -6 lines changed


src/llama.cpp

Lines changed: 24 additions & 6 deletions
@@ -19494,19 +19494,37 @@ struct llama_context * llama_new_context_with_model(
         // buffer used to store the computation graph and the tensor meta data
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

+        // TODO: move these checks to ggml_backend_sched
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
         bool pipeline_parallel =
             llama_get_device_count(*model) > 1 &&
             model->n_gpu_layers > (int)model->hparams.n_layer &&
             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
             params.offload_kqv;

-        // FIXME
-#if !defined(GGML_USE_CUDA) && false
-        // pipeline parallelism requires support for async compute and events
-        // currently this is only implemented in the CUDA backend
-        pipeline_parallel = false;
-#endif
+        // pipeline parallelism requires support for async compute and events in all devices
+        if (pipeline_parallel) {
+            for (auto * backend : ctx->backends) {
+                if (ggml_backend_is_cpu(backend)) {
+                    // ignore CPU backend
+                    continue;
+                }
+                auto * dev = ggml_backend_get_device(backend);
+                if (!dev) {
+                    // backend is using old interface, not supported
+                    pipeline_parallel = false;
+                    break;
+                }
+                ggml_backend_dev_props props;
+                ggml_backend_dev_get_props(dev, &props);
+                if (!props.caps.async || !props.caps.events) {
+                    // device does not support async compute or events
+                    pipeline_parallel = false;
+                    break;
+                }
+            }
+        }
+
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);

         if (pipeline_parallel) {
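
Note: the change above replaces the hard-coded "#if !defined(GGML_USE_CUDA) && false" escape hatch with a runtime query of each backend's device properties, so pipeline parallelism stays enabled only when every non-CPU backend reports support for async compute and events. The sketch below shows the same check factored into a standalone helper. It is a minimal illustration, not part of the commit: it assumes only the ggml-backend device API already used in the diff (ggml_backend_is_cpu, ggml_backend_get_device, ggml_backend_dev_get_props, props.caps.async / props.caps.events), and the helper name supports_pipeline_parallel is hypothetical.

    // sketch only: same capability check as the diff, as a standalone helper
    #include <vector>
    #include "ggml-backend.h"

    // hypothetical helper, not part of the commit
    static bool supports_pipeline_parallel(const std::vector<ggml_backend_t> & backends) {
        for (auto * backend : backends) {
            if (ggml_backend_is_cpu(backend)) {
                // the CPU backend is exempt from the check
                continue;
            }
            auto * dev = ggml_backend_get_device(backend);
            if (!dev) {
                // backend uses the old interface and exposes no device properties
                return false;
            }
            ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            if (!props.caps.async || !props.caps.events) {
                // async compute and events are both required for pipeline parallelism
                return false;
            }
        }
        return true;
    }

With such a helper, the condition could read roughly "pipeline_parallel = pipeline_parallel && supports_pipeline_parallel(ctx->backends)". The commit itself keeps the loop inline in llama_new_context_with_model and marks it with a TODO to eventually move the check into ggml_backend_sched.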
