Skip to content

Commit 92090ec

Browse files
authored
llama : add function for model-based max number of graph nodes (#8622)
* llama : model-based max number of graph nodes
  ggml-ci
* llama : disable 405B max_nodes path due to lack of complaints
  ggml-ci
1 parent 9d03d08 commit 92090ec

File tree

1 file changed

+53
-43
lines changed

1 file changed

+53
-43
lines changed

src/llama.cpp

Lines changed: 53 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@
101101
#endif
102102

103103
// bump if necessary
104-
#define LLAMA_MAX_NODES 8192
105104
#define LLAMA_MAX_LAYERS 512
106105
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
107106

@@ -3567,6 +3566,15 @@ namespace GGUFMeta {
35673566

35683567
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
35693568

3569+
// Maximum number of graph nodes to reserve for a given model.
// Replaces the former LLAMA_MAX_NODES macro (8192): the result is passed to
// ggml_new_graph_custom() by the graph builders, to ggml_backend_sched_new(),
// and is used to size the context's buf_compute_meta buffer.
// The parameter name is commented out, so the model is currently ignored and
// every model gets the same limit.
// TODO: update when needed or think of some clever automatic way to do this
3570+
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
3571+
// Disabled model-specific path: would raise the limit to 32768 for LLM_ARCH_LLAMA
// models above a layer-count threshold that is still a placeholder ("??").
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
3572+
// return 32768;
3573+
//}
3574+
3575+
// default limit, identical to the old LLAMA_MAX_NODES value
return 8192;
3576+
}
3577+
35703578
struct llama_model_loader {
35713579
int n_kv = 0;
35723580
int n_tensors = 0;
@@ -8396,7 +8404,7 @@ struct llm_build_context {
83968404
}
83978405

83988406
struct ggml_cgraph * build_k_shift() {
8399-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8407+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
84008408

84018409
GGML_ASSERT(kv_self.size == n_ctx);
84028410

@@ -8427,7 +8435,7 @@ struct llm_build_context {
84278435
}
84288436

84298437
struct ggml_cgraph * build_s_copy() {
8430-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8438+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
84318439

84328440
GGML_ASSERT(kv_self.recurrent);
84338441

@@ -8450,7 +8458,7 @@ struct llm_build_context {
84508458
}
84518459

84528460
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
8453-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8461+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
84548462

84558463
for (uint32_t i = 0; i < ids.size(); ++i) {
84568464
const uint32_t id = ids[i];
@@ -8691,7 +8699,7 @@ struct llm_build_context {
86918699
}
86928700

86938701
struct ggml_cgraph * build_llama() {
8694-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8702+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
86958703

86968704
// mutable variable, needed during the last layer of the computation to skip unused tokens
86978705
int32_t n_tokens = this->n_tokens;
@@ -8834,7 +8842,7 @@ struct llm_build_context {
88348842
}
88358843

88368844
struct ggml_cgraph * build_baichuan() {
8837-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8845+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
88388846

88398847
const int64_t n_embd_head = hparams.n_embd_head_v;
88408848
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8949,7 +8957,7 @@ struct llm_build_context {
89498957
}
89508958

89518959
struct ggml_cgraph * build_xverse() {
8952-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8960+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
89538961

89548962
const int64_t n_embd_head = hparams.n_embd_head_v;
89558963
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9052,7 +9060,7 @@ struct llm_build_context {
90529060
}
90539061

90549062
struct ggml_cgraph * build_falcon() {
9055-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9063+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
90569064

90579065
const int64_t n_embd_head = hparams.n_embd_head_v;
90589066
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9172,7 +9180,7 @@ struct llm_build_context {
91729180
}
91739181

91749182
struct ggml_cgraph * build_grok() {
9175-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9183+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
91769184

91779185
// mutable variable, needed during the last layer of the computation to skip unused tokens
91789186
int32_t n_tokens = this->n_tokens;
@@ -9329,7 +9337,7 @@ struct llm_build_context {
93299337
}
93309338

93319339
struct ggml_cgraph * build_dbrx() {
9332-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9340+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
93339341

93349342
// mutable variable, needed during the last layer of the computation to skip unused tokens
93359343
int32_t n_tokens = this->n_tokens;
@@ -9455,7 +9463,7 @@ struct llm_build_context {
94559463
}
94569464

94579465
struct ggml_cgraph * build_starcoder() {
9458-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9466+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
94599467

94609468
const int64_t n_embd_head = hparams.n_embd_head_v;
94619469
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9559,7 +9567,7 @@ struct llm_build_context {
95599567
}
95609568

95619569
struct ggml_cgraph * build_refact() {
9562-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9570+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
95639571

95649572
const int64_t n_embd_head = hparams.n_embd_head_v;
95659573
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9653,7 +9661,7 @@ struct llm_build_context {
96539661
}
96549662

96559663
struct ggml_cgraph * build_bert() {
9656-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9664+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
96579665

96589666
const int64_t n_embd_head = hparams.n_embd_head_v;
96599667
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9847,7 +9855,7 @@ struct llm_build_context {
98479855
}
98489856

98499857
struct ggml_cgraph * build_bloom() {
9850-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9858+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
98519859

98529860
const int64_t n_embd_head = hparams.n_embd_head_v;
98539861
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9948,7 +9956,7 @@ struct llm_build_context {
99489956
}
99499957

99509958
struct ggml_cgraph * build_mpt() {
9951-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9959+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
99529960

99539961
const int64_t n_embd_head = hparams.n_embd_head_v;
99549962
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10238,7 +10246,7 @@ struct llm_build_context {
1023810246
}
1023910247

1024010248
struct ggml_cgraph * build_qwen() {
10241-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10249+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1024210250

1024310251
const int64_t n_embd_head = hparams.n_embd_head_v;
1024410252
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10350,7 +10358,7 @@ struct llm_build_context {
1035010358
}
1035110359

1035210360
struct ggml_cgraph * build_qwen2() {
10353-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10361+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1035410362

1035510363
const int64_t n_embd_head = hparams.n_embd_head_v;
1035610364
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10462,7 +10470,7 @@ struct llm_build_context {
1046210470
}
1046310471

1046410472
struct ggml_cgraph * build_qwen2moe() {
10465-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10473+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1046610474

1046710475
// mutable variable, needed during the last layer of the computation to skip unused tokens
1046810476
int32_t n_tokens = this->n_tokens;
@@ -10608,7 +10616,7 @@ struct llm_build_context {
1060810616
}
1060910617

1061010618
struct ggml_cgraph * build_phi2() {
10611-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10619+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1061210620

1061310621
const int64_t n_embd_head = hparams.n_embd_head_v;
1061410622
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10729,7 +10737,7 @@ struct llm_build_context {
1072910737
}
1073010738

1073110739
struct ggml_cgraph * build_phi3() {
10732-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10740+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1073310741

1073410742
const int64_t n_embd_head = hparams.n_embd_head_v;
1073510743
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10961,7 +10969,7 @@ struct llm_build_context {
1096110969
}
1096210970

1096310971
struct ggml_cgraph * build_gpt2() {
10964-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10972+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1096510973

1096610974
const int64_t n_embd_head = hparams.n_embd_head_v;
1096710975
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11066,7 +11074,7 @@ struct llm_build_context {
1106611074
}
1106711075

1106811076
struct ggml_cgraph * build_codeshell() {
11069-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11077+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1107011078

1107111079
const int64_t n_embd_head = hparams.n_embd_head_v;
1107211080
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11177,7 +11185,7 @@ struct llm_build_context {
1117711185
}
1117811186

1117911187
struct ggml_cgraph * build_orion() {
11180-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11188+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1118111189

1118211190
const int64_t n_embd_head = hparams.n_embd_head_v;
1118311191
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11295,7 +11303,7 @@ struct llm_build_context {
1129511303
}
1129611304

1129711305
struct ggml_cgraph * build_internlm2() {
11298-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11306+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1129911307

1130011308
const int64_t n_embd_head = hparams.n_embd_head_v;
1130111309
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11416,7 +11424,7 @@ struct llm_build_context {
1141611424
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
1141711425
// based on the original build_llama() function
1141811426
struct ggml_cgraph * build_minicpm() {
11419-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11427+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1142011428

1142111429
const int64_t n_embd_head = hparams.n_embd_head_v;
1142211430
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11560,7 +11568,7 @@ struct llm_build_context {
1156011568
}
1156111569

1156211570
struct ggml_cgraph * build_gemma() {
11563-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11571+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1156411572

1156511573
const int64_t n_embd_head_k = hparams.n_embd_head_k;
1156611574

@@ -11668,7 +11676,7 @@ struct llm_build_context {
1166811676
}
1166911677

1167011678
struct ggml_cgraph * build_gemma2() {
11671-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11679+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1167211680

1167311681
const int64_t n_embd_head_k = hparams.n_embd_head_k;
1167411682

@@ -11803,7 +11811,7 @@ struct llm_build_context {
1180311811

1180411812

1180511813
struct ggml_cgraph * build_starcoder2() {
11806-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11814+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1180711815

1180811816
const int64_t n_embd_head = hparams.n_embd_head_v;
1180911817
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11922,7 +11930,7 @@ struct llm_build_context {
1192211930
}
1192311931

1192411932
struct ggml_cgraph * build_mamba() {
11925-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11933+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1192611934

1192711935
const int64_t d_model = n_embd;
1192811936
const int64_t d_conv = hparams.ssm_d_conv;
@@ -12071,7 +12079,7 @@ struct llm_build_context {
1207112079

1207212080
struct ggml_cgraph * build_command_r() {
1207312081

12074-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12082+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1207512083

1207612084
const int64_t n_embd_head = hparams.n_embd_head_v;
1207712085
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12225,7 +12233,7 @@ struct llm_build_context {
1222512233
// * removed bias
1222612234
// * removed MoE
1222712235
struct ggml_cgraph * build_olmo() {
12228-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12236+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1222912237

1223012238
// mutable variable, needed during the last layer of the computation to skip unused tokens
1223112239
int32_t n_tokens = this->n_tokens;
@@ -12349,7 +12357,7 @@ struct llm_build_context {
1234912357
}
1235012358

1235112359
struct ggml_cgraph * build_openelm() {
12352-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12360+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1235312361

1235412362
const int64_t n_embd_head = hparams.n_embd_head_v;
1235512363
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12474,7 +12482,7 @@ struct llm_build_context {
1247412482
}
1247512483

1247612484
struct ggml_cgraph * build_gptneox() {
12477-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12485+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1247812486

1247912487
const int64_t n_embd_head = hparams.n_embd_head_v;
1248012488
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -12616,7 +12624,7 @@ struct llm_build_context {
1261612624
}
1261712625

1261812626
struct ggml_cgraph * build_arctic() {
12619-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12627+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1262012628

1262112629
// mutable variable, needed during the last layer of the computation to skip unused tokens
1262212630
int32_t n_tokens = this->n_tokens;
@@ -12748,7 +12756,7 @@ struct llm_build_context {
1274812756
}
1274912757

1275012758
struct ggml_cgraph * build_deepseek2() {
12751-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12759+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1275212760

1275312761
// mutable variable, needed during the last layer of the computation to skip unused tokens
1275412762
int32_t n_tokens = this->n_tokens;
@@ -12976,7 +12984,7 @@ struct llm_build_context {
1297612984
}
1297712985

1297812986
struct ggml_cgraph * build_bitnet() {
12979-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
12987+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1298012988

1298112989
const int64_t n_embd_head = hparams.n_embd_head_v;
1298212990
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13116,7 +13124,7 @@ struct llm_build_context {
1311613124
}
1311713125

1311813126
struct ggml_cgraph * build_t5() {
13119-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
13127+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1312013128

1312113129
// mutable variable, needed during the last layer of the computation to skip unused tokens
1312213130
int32_t n_tokens = this->n_tokens;
@@ -13433,7 +13441,7 @@ struct llm_build_context {
1343313441
}
1343413442

1343513443
struct ggml_cgraph * build_jais() {
13436-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
13444+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1343713445

1343813446
const int64_t n_embd_head = hparams.n_embd_head_v;
1343913447
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -13525,7 +13533,7 @@ struct llm_build_context {
1352513533
}
1352613534

1352713535
struct ggml_cgraph * build_chatglm() {
13528-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
13536+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1352913537

1353013538
const int64_t n_embd_head = hparams.n_embd_head_v;
1353113539
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -14870,9 +14878,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
1487014878
// each move requires 6*n_layer tensors (see build_defrag)
1487114879
// - source view, destination view, copy operation
1487214880
// - x2 for keys and values
14873-
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
14881+
//const uint32_t max_moves = llama_model_max_nodes(lctx.model)/(6*n_layer);
1487414882
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
14875-
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
14883+
const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
1487614884

1487714885
// determine which KV cells to move where
1487814886
//
@@ -16762,8 +16770,10 @@ struct llama_context * llama_new_context_with_model(
1676216770
}
1676316771
}
1676416772

16773+
const size_t max_nodes = llama_model_max_nodes(*model);
16774+
1676516775
// buffer used to store the computation graph and the tensor meta data
16766-
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
16776+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
1676716777

1676816778
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
1676916779
bool pipeline_parallel =
@@ -16776,7 +16786,7 @@ struct llama_context * llama_new_context_with_model(
1677616786
// currently this is only implemented in the CUDA backend
1677716787
pipeline_parallel = false;
1677816788
#endif
16779-
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
16789+
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
1678016790

1678116791
if (pipeline_parallel) {
1678216792
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));

0 commit comments

Comments
 (0)