@@ -396,7 +396,7 @@ def generate(
             and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)])
         ):
             if self.verbose:
-                print("generate cache hit", file=sys.stderr)
+                print("Llama.generate: cache hit", file=sys.stderr)
             reset = False
             tokens = tokens[len(self.eval_tokens) :]

@@ -518,7 +518,7 @@ def _create_completion(

         if self.cache and prompt_tokens in self.cache:
             if self.verbose:
-                print("cache hit", file=sys.stderr)
+                print("Llama._create_completion: cache hit", file=sys.stderr)
             self.load_state(self.cache[prompt_tokens])

         finish_reason = "length"
@@ -538,7 +538,7 @@ def _create_completion(
             if self.cache and len(completion_tokens) == 0:
                 if prompt_tokens not in self.cache:
                     if self.verbose:
-                        print("cache miss", file=sys.stderr)
+                        print("Llama._create_completion: cache miss", file=sys.stderr)
                     self.cache[prompt_tokens] = self.save_state()

             completion_tokens.append(token)
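
The cache touched in these hunks is consulted with `in`, indexed by `prompt_tokens`, and written with the result of `save_state()`. A minimal sketch of that mapping contract, using an illustrative class name (not the library's actual implementation):

from typing import Any, Dict, Sequence, Tuple

class TokenStateCache:
    # Illustrative stand-in: maps a prompt's token sequence to a saved
    # model state, mirroring how the diff uses `prompt_tokens in self.cache`,
    # `self.cache[prompt_tokens]`, and `self.cache[prompt_tokens] = state`.
    def __init__(self) -> None:
        self._store: Dict[Tuple[int, ...], Any] = {}

    def __contains__(self, tokens: Sequence[int]) -> bool:
        return tuple(tokens) in self._store

    def __getitem__(self, tokens: Sequence[int]) -> Any:
        return self._store[tuple(tokens)]

    def __setitem__(self, tokens: Sequence[int], state: Any) -> None:
        self._store[tuple(tokens)] = state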
@@ -957,6 +957,8 @@ def save_state(self) -> LlamaState:
             raise RuntimeError("Failed to copy llama state data")
         llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))()
         llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes))
+        if self.verbose:
+            print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr)
         return LlamaState(
             eval_tokens=self.eval_tokens.copy(),
             eval_logits=self.eval_logits.copy(),
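
Taken together, the qualified message names make it possible to tell from stderr which code path fired. A minimal usage sketch, assuming the library's `Llama` and `LlamaCache` classes, a `set_cache()` helper, and a local model path (all assumptions here, not shown in the diff):

import llama_cpp

# Hypothetical model path; verbose=True routes the messages above to stderr.
llm = llama_cpp.Llama(model_path="./models/7B/ggml-model.bin", verbose=True)
llm.set_cache(llama_cpp.LlamaCache())  # assumes a set_cache() helper exists

prompt = "Q: What is the capital of France? A:"

# First run: expect "Llama._create_completion: cache miss" followed by
# "Llama.save_state: saving <n> bytes of llama state" on stderr.
llm(prompt, max_tokens=16)

# Second run with the identical prompt: expect
# "Llama._create_completion: cache hit" instead.
llm(prompt, max_tokens=16)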