2 files changed, +7 −0 lines changed
@@ -366,6 +366,8 @@ extern "C" {
         bool no_perf;    // measure performance timings
         bool op_offload; // offload host tensor operations to device
         bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                         // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                         //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };

     // model quantization parameters
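In practical terms, the new comment advises callers that run multiple parallel sequences to keep the full-size SWA cache enabled. Below is a minimal sketch of how client code might configure this through the public llama.h API; the model path and the n_seq_max value are illustrative placeholders, and the function names assume a recent llama.cpp revision:

    #include "llama.h"

    // Sketch: keep swa_full enabled when using more than one parallel sequence,
    // which is the configuration the new NOTE recommends.
    // "model.gguf" and n_seq_max = 4 are placeholder values.
    int main(void) {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.n_seq_max = 4;    // multiple parallel sequences
        cparams.swa_full  = true; // full-size SWA cache; avoids the degraded-performance case

        llama_context * ctx = llama_init_from_model(model, cparams);
        // ... decode work ...
        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }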
@@ -123,6 +123,11 @@ llama_context::llama_context(
             __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

+    if (!params.swa_full && cparams.n_seq_max > 1) {
+        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
+                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
+    }
+
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
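For reference, since this warning is emitted from the constructor, __func__ expands to llama_context; with, say, n_seq_max = 4 (an illustrative value), the added warning would render roughly as:

    llama_context: requested n_seq_max (4) > 1, but swa_full is not enabled -- performance may be degraded: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573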