1 file changed: +5 -3 lines changed
Columns: original file line number | diff line number | diff line change
@@ -8805,12 +8805,14 @@ static int llama_decode_impl(
8805
8805
// llama_synchronize(&lctx);
8806
8806
8807
8807
// decide if we need to defrag the kv cache
8808
- if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
8809
- const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
8808
+ if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
8809
+ // - do not defrag small contexts (i.e. < 2048 tokens)
8810
+ // - count the padding towards the number of used tokens
8811
+ const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
8810
8812
8811
8813
// queue defragmentation for next llama_kv_cache_update
8812
8814
if (fragmentation > cparams.defrag_thold) {
8813
- // LLAMA_LOG_INFO(" fragmentation: %.2f\n", fragmentation);
8815
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
8814
8816
8815
8817
llama_kv_cache_defrag(kv_self);
8816
8818
}
You can’t perform that action at this time.
0 commit comments