Skip to content

Commit e1067ef

Browse files
committed
llama : fix n_kv to never become 0
1 parent 7b7472e commit e1067ef

File tree

1 file changed

+9
-7
lines changed

1 file changed

+9
-7
lines changed

llama.cpp

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1025,7 +1025,7 @@ struct llama_kv_cache {
1025 1025      uint32_t size = 0;
1026 1026
1027 1027      // computed before each graph build
1028      -    uint32_t cell_max = 0;
     1028 +    uint32_t n = 0;
1029 1029
1030 1030      std::vector<llama_kv_cell> cells;
1031 1031

@@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
2619 2619      const int n_gpu_layers = model.n_gpu_layers;
2620 2620
2621 2621      const int32_t n_tokens = batch.n_tokens;
2622      -    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
     2622 +    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2623 2623      const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2624 2624
2625 2625      const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3007,7 +3007,7 @@ static struct ggml_cgraph * llm_build_baichaun(
3007 3007      const int n_gpu_layers = model.n_gpu_layers;
3008 3008
3009 3009      const int32_t n_tokens = batch.n_tokens;
3010      -    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
     3010 +    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3011 3011      const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3012 3012
3013 3013      const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3410,7 +3410,7 @@ static struct ggml_cgraph * llm_build_falcon(
3410 3410      const int n_gpu_layers = model.n_gpu_layers;
3411 3411
3412 3412      const int32_t n_tokens = batch.n_tokens;
3413      -    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
     3413 +    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3414 3414      const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3415 3415
3416 3416      const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3771,7 +3771,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3771 3771      const float norm_eps = hparams.f_norm_eps;
3772 3772
3773 3773      const int32_t n_tokens = batch.n_tokens;
3774      -    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
     3774 +    const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3775 3775      const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3776 3776
3777 3777      auto & buf_compute = lctx.buf_compute;
@@ -4102,8 +4102,10 @@ static int llama_decode_internal(
4102 4102      // a heuristic, to avoid attending the full cache if it is not yet utilized
4103 4103      // after enough generations, the benefit from this heuristic disappears
4104 4104      // if we start defragmenting the cache, the benefit from this will be more important
4105      -    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
4106      -    //printf("kv_self.cell_max = %d\n", kv_self.cell_max);
     4105 +    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
     4106 +    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
     4107 +
     4108 +    //printf("kv_self.n = %d\n", kv_self.n);
4107 4109
4108 4110      ggml_allocr_reset(lctx.alloc);
4109 4111

0 commit comments

Comments (0)