
Commit 855b397

llama : add warning about multi-sequence SWA contexts
1 parent 4a9253a commit 855b397

File tree: 2 files changed, +7 −0 lines changed

include/llama.h

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ extern "C" {
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
 
     // model quantization parameters
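
For callers, the practical takeaway of the new NOTE is to leave swa_full enabled when a context will serve more than one sequence. A minimal usage sketch, assuming an already loaded llama_model (the n_seq_max value of 4 is illustrative, and make_multi_seq_ctx is a hypothetical helper):

#include "llama.h"

// Hypothetical helper: create a context for multiple parallel sequences.
// Keeping swa_full = true avoids the performance degradation that the
// new NOTE in llama.h describes for n_seq_max > 1.
static llama_context * make_multi_seq_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max = 4;     // more than one sequence
    cparams.swa_full  = true;  // full-size SWA cache; setting this to false
                               // with n_seq_max > 1 can degrade performance
    return llama_init_from_model(model, cparams);
}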

src/llama-context.cpp

Lines changed: 5 additions & 0 deletions
@@ -123,6 +123,11 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
+    if (!params.swa_full && cparams.n_seq_max > 1) {
+        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
+                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
+    }
+
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
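
Given the format string above, a context created with swa_full disabled and, say, n_seq_max = 4 (an illustrative value) would log a single warning during construction, along the lines of:

llama_context: requested n_seq_max (4) > 1, but swa_full is not enabled -- performance may be degraded: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573

Note that the check reads params.swa_full (the caller-supplied setting) but cparams.n_seq_max (the effective value stored on the context), so the warning reflects the sequence count the context will actually use.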
