@@ -19494,19 +19494,37 @@ struct llama_context * llama_new_context_with_model(
     // buffer used to store the computation graph and the tensor meta data
     ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
+    // TODO: move these checks to ggml_backend_sched
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         llama_get_device_count(*model) > 1 &&
         model->n_gpu_layers > (int)model->hparams.n_layer &&
         model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
         params.offload_kqv;
 
-    // FIXME
-#if !defined(GGML_USE_CUDA) && false
-    // pipeline parallelism requires support for async compute and events
-    // currently this is only implemented in the CUDA backend
-    pipeline_parallel = false;
-#endif
+    // pipeline parallelism requires support for async compute and events in all devices
+    if (pipeline_parallel) {
+        for (auto * backend : ctx->backends) {
+            if (ggml_backend_is_cpu(backend)) {
+                // ignore CPU backend
+                continue;
+            }
+            auto * dev = ggml_backend_get_device(backend);
+            if (!dev) {
+                // backend is using old interface, not supported
+                pipeline_parallel = false;
+                break;
+            }
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            if (!props.caps.async || !props.caps.events) {
+                // device does not support async compute or events
+                pipeline_parallel = false;
+                break;
+            }
+        }
+    }
+
     ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
 
     if (pipeline_parallel) {
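
For reference, the capability check added above can be read in isolation as the small helper sketched below. This is a sketch only, not part of the patch: the helper name supports_pipeline_parallel is hypothetical, and the exact headers (e.g. whether ggml_backend_is_cpu comes from ggml-backend.h or ggml-cpu.h) depend on the ggml version in use.

#include <vector>
#include "ggml-backend.h"

// Sketch: true if every non-CPU backend exposes a device that reports both
// async compute and event support, mirroring the inline check in
// llama_new_context_with_model.
static bool supports_pipeline_parallel(const std::vector<ggml_backend_t> & backends) {
    for (ggml_backend_t backend : backends) {
        if (ggml_backend_is_cpu(backend)) {
            // the CPU backend is ignored, as in the patch
            continue;
        }
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        if (dev == nullptr) {
            // backend uses the old interface and exposes no device
            return false;
        }
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        if (!props.caps.async || !props.caps.events) {
            // pipeline parallelism needs both async compute and events
            return false;
        }
    }
    return true;
}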