Commit 9ea7a37

Merge branch 'abetlen:main' into main
2 parents: 719c3ea + 232880c


6 files changed: +118, -69 lines


.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 21 additions & 5 deletions
@@ -57,7 +57,17 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f
 3. step 3
 4. etc.
 
-**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
+**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
+
+Try the following:
+
+1. `git clone https://github.com/abetlen/llama-cpp-python`
+2. `cd llama-cpp-python`
+3. `rm -rf _skbuild/` # delete any old builds
+4. `python setup.py develop`
+5. `cd ./vendor/llama.cpp`
+6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
+7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
 
 # Failure Logs
 
@@ -73,8 +83,14 @@ commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2
 llama-cpp-python$ python3 --version
 Python 3.10.10
 
-llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette"
-fastapi 0.95.0
-sse-starlette 1.3.3
-uvicorn 0.21.1
+llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"
+fastapi 0.95.0
+numpy 1.24.3
+sse-starlette 1.3.3
+uvicorn 0.21.1
+
+llama-cpp-python/vendor/llama.cpp$ git log | head -3
+commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
+Author: Kerfuffle <[email protected]>
+Date: Thu May 25 20:18:01 2023 -0600
 ```
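The failure-log section asks for the same details each time: the llama-cpp-python commit, the Python version, the relevant pip packages, and now the bundled llama.cpp commit. A hedged helper sketch for collecting them in one go, using only the standard library; the `vendor/llama.cpp` path assumes the script is run from the root of a source checkout:

```python
# collect_env.py - sketch for gathering the details requested in the failure log.
# Assumes it is run from the root of a llama-cpp-python source checkout.
import subprocess

def run(cmd: str, cwd: str = ".") -> str:
    """Run a shell command and return its stdout, or the error text if it fails."""
    try:
        out = subprocess.run(
            cmd, shell=True, cwd=cwd, capture_output=True, text=True, check=True
        )
        return out.stdout.strip()
    except subprocess.CalledProcessError as exc:
        return f"<failed: {exc}>"

print(run("git log | head -1"))                                  # llama-cpp-python commit
print(run("python3 --version"))
print(run('pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"'))
print(run("git log | head -3", cwd="vendor/llama.cpp"))          # bundled llama.cpp commit
```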

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -7,9 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [v0.1.56]
+
 ### Added
 
 - Added first version of the changelog
+- Server: Use async routes
+- Use numpy for internal buffers to reduce memory usage and improve performance.
 
 ### Fixed
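The "numpy for internal buffers" entry refers to the `llama_cpp/llama.py` changes below: token ids and per-token logits move from Python containers into contiguous arrays. A rough, illustrative sketch of the idea (names are not the library's):

```python
import numpy as np

n_vocab = 32000

# Before: a list of per-token logit lists, one boxed Python float per value.
eval_logits: list[list[float]] = []

# After: a single contiguous float32 matrix, one row per evaluated token.
scores = np.zeros((0, n_vocab), dtype=np.single)

new_rows = np.random.rand(4, n_vocab).astype(np.single)  # pretend logits for 4 tokens
scores = np.concatenate((scores, new_rows), axis=0)

print(scores.shape, scores.nbytes)  # 4 bytes per value vs. ~24+ per boxed float
```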

llama_cpp/llama.py

Lines changed: 58 additions & 32 deletions
@@ -22,6 +22,9 @@
 from . import llama_cpp
 from .llama_types import *
 
+import numpy as np
+import numpy.typing as npt
+
 
 
 class LlamaCache:
@@ -76,11 +79,15 @@ def __init__(
         self,
         eval_tokens: Deque[int],
         eval_logits: Deque[List[float]],
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
         llama_state,  # type: llama_cpp.Array[llama_cpp.c_uint8]
         llama_state_size: int,
     ):
         self.eval_tokens = eval_tokens
         self.eval_logits = eval_logits
+        self.input_ids = input_ids
+        self.scores = scores
         self.llama_state = llama_state
         self.llama_state_size = llama_state_size
 
@@ -210,27 +217,27 @@ def __init__(
 
         self._n_vocab = self.n_vocab()
         self._n_ctx = self.n_ctx()
-        data = (llama_cpp.llama_token_data * self._n_vocab)(
-            *[
-                llama_cpp.llama_token_data(
-                    id=llama_cpp.llama_token(i),
-                    logit=llama_cpp.c_float(0.0),
-                    p=llama_cpp.c_float(0.0),
-                )
-                for i in range(self._n_vocab)
-            ]
-        )
         size = llama_cpp.c_size_t(self._n_vocab)
-        sorted = False
+        sorted = llama_cpp.c_bool(False)
+        self._candidates_data = np.array(
+            [],
+            dtype=np.dtype(
+                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
+            ),
+        )
+        self._candidates_data.resize(3, self._n_vocab)
         candidates = llama_cpp.llama_token_data_array(
-            data=data,
+            data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=size,
             sorted=sorted,
         )
         self._candidates = candidates
         self._token_nl = Llama.token_nl()
         self._token_eos = Llama.token_eos()
 
+        self._input_ids = np.array([], dtype=np.intc)
+        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
+
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
 
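The hunk above replaces a per-token list comprehension of ctypes structs with a single aligned structured numpy array whose buffer is handed to C by pointer. A self-contained sketch of the pattern; `TokenData` is a stand-in defined only for illustration, while the real binding uses `llama_cpp.llama_token_data` / `llama_token_data_p`:

```python
import ctypes
import numpy as np

class TokenData(ctypes.Structure):
    """Stand-in for llama.cpp's llama_token_data: (id, logit, p)."""
    _fields_ = [
        ("id", ctypes.c_int),
        ("logit", ctypes.c_float),
        ("p", ctypes.c_float),
    ]

n_vocab = 8  # toy vocabulary size
# align=True keeps the record layout compatible with the C struct above.
candidates_data = np.zeros(
    n_vocab,
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)
candidates_data["id"] = np.arange(n_vocab, dtype=np.intc)
candidates_data["logit"] = np.linspace(-1.0, 1.0, n_vocab, dtype=np.single)

# The whole buffer is exposed to C as one pointer; no per-element ctypes objects.
ptr = candidates_data.ctypes.data_as(ctypes.POINTER(TokenData))
print(ptr[0].id, round(ptr[0].logit, 3))
```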
@@ -298,6 +305,8 @@ def reset(self):
         """Reset the model state."""
         self.eval_tokens.clear()
         self.eval_logits.clear()
+        self._input_ids = np.array([], dtype=np.intc)
+        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
 
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
@@ -309,7 +318,7 @@ def eval(self, tokens: Sequence[int]):
         n_ctx = self._n_ctx
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), len(self.eval_tokens))
+            n_past = min(n_ctx - len(batch), len(self._input_ids))
             n_tokens = len(batch)
             return_code = llama_cpp.llama_eval(
                 ctx=self.ctx,
@@ -322,13 +331,19 @@ def eval(self, tokens: Sequence[int]):
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.eval_tokens.extend(batch)
+            self._input_ids: npt.NDArray[np.intc] = np.concatenate(
+                (self._input_ids, np.array(batch, dtype=np.intc)), axis=0
+            )
             # Save logits
             rows = n_tokens if self.params.logits_all else 1
             n_vocab = self._n_vocab
             cols = n_vocab
             logits_view = llama_cpp.llama_get_logits(self.ctx)
             logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
             self.eval_logits.extend(logits)
+            self._scores: npt.NDArray[np.single] = np.concatenate(
+                (self._scores, np.array(logits, dtype=np.single)), axis=0
+            )
 
     def _sample(
         self,
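With the change above, `eval()` grows `self._input_ids` and `self._scores` in lockstep, so row *i* of the scores buffer holds the logits produced after token *i*. A toy sketch of that layout, assuming one logits row per token (the `logits_all=True` case); `fake_eval` is an illustrative stand-in for `llama_eval`:

```python
import numpy as np

n_vocab = 5
input_ids = np.array([], dtype=np.intc)
scores = np.ndarray((0, n_vocab), dtype=np.single)

def fake_eval(batch: list) -> np.ndarray:
    """Stand-in for llama_eval: one row of logits per token in the batch."""
    return np.random.rand(len(batch), n_vocab).astype(np.single)

for batch in ([1, 2, 3], [4]):
    logits = fake_eval(batch)
    input_ids = np.concatenate((input_ids, np.array(batch, dtype=np.intc)), axis=0)
    scores = np.concatenate((scores, logits), axis=0)

assert scores.shape[0] == len(input_ids)  # one row of logits per evaluated token
last_token_logits = scores[-1, :]         # what sampling reads next
print(input_ids.tolist(), last_token_logits.shape)
```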
@@ -349,6 +364,7 @@ def _sample(
     ):
         assert self.ctx is not None
         assert len(self.eval_logits) > 0
+        assert self._scores.shape[0] > 0
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
@@ -357,18 +373,23 @@ def _sample(
             if last_n_tokens_size.value < 0
             else last_n_tokens_size
         )
-        logits = self.eval_logits[-1]
+        logits: npt.NDArray[np.single] = self._scores[-1, :]
 
         if logits_processor is not None:
-            logits = logits_processor(list(self.eval_tokens), logits)
-            self.eval_logits[-1] = logits
+            logits = np.array(
+                logits_processor(self._input_ids.tolist(), logits.tolist()),
+                dtype=np.single,
+            )
+            self._scores[-1, :] = logits
+            self.eval_logits[-1] = logits.tolist()
 
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
-        for i, logit in enumerate(logits):
-            candidates.data[i].id = llama_cpp.llama_token(i)
-            candidates.data[i].logit = llama_cpp.c_float(logit)
-            candidates.data[i].p = llama_cpp.c_float(0.0)
+        candidates_data = self._candidates_data
+        candidates_data["id"] = np.arange(n_vocab, dtype=np.intc)  # type: ignore
+        candidates_data["logit"] = logits
+        candidates_data["p"] = np.zeros(n_vocab, dtype=np.single)
+        candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)
         candidates.sorted = llama_cpp.c_bool(False)
         candidates.size = llama_cpp.c_size_t(n_vocab)
         llama_cpp.llama_sample_repetition_penalty(
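In the `_sample` hunk above, a logits processor is called with plain Python lists (the token ids seen so far and the last row of `_scores`) and its result is written back as float32. A toy processor under that assumed call convention; the function name and the banned token id are illustrative only:

```python
import numpy as np

def ban_token_7(input_ids: list, logits: list) -> list:
    """Illustrative processor: forbid token id 7 by pushing its logit to -inf."""
    out = list(logits)
    out[7] = float("-inf")
    return out

n_vocab = 16
scores = np.random.rand(3, n_vocab).astype(np.single)  # pretend scores buffer
input_ids = [1, 5, 9]

logits = scores[-1, :]
logits = np.array(ban_token_7(input_ids, logits.tolist()), dtype=np.single)
scores[-1, :] = logits  # written back into the buffer, as in the diff
print(logits[7])        # -inf
```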
@@ -486,8 +507,8 @@ def sample(
         """
         assert self.ctx is not None
         last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
-            0, self.last_n_tokens_size - len(self.eval_tokens)
-        ) + list(self.eval_tokens)[-self.last_n_tokens_size :]
+            0, self.last_n_tokens_size - len(self._input_ids)
+        ) + self._input_ids[-self.last_n_tokens_size :].tolist()
         return self._sample(
             last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)(
                 *last_n_tokens_data
@@ -545,9 +566,9 @@ def generate(
         """
         assert self.ctx is not None
 
-        if reset and len(self.eval_tokens) > 0:
+        if reset and len(self._input_ids) > 0:
             longest_prefix = 0
-            for a, b in zip(self.eval_tokens, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens[:-1]):
                 if a == b:
                     longest_prefix += 1
                 else:
@@ -557,6 +578,8 @@
                     print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
+                self._input_ids = self._input_ids[:longest_prefix]
+                self._scores = self._scores[:longest_prefix, :]
                 for _ in range(len(self.eval_tokens) - longest_prefix):
                     self.eval_tokens.pop()
         try:
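The two `generate()` hunks above keep the prefix-matching behaviour but now also truncate the numpy caches so they stay in sync with `eval_tokens`. A standalone sketch of the reuse logic with illustrative names:

```python
import numpy as np

n_vocab = 4
cached_ids = np.array([10, 11, 12, 13], dtype=np.intc)
cached_scores = np.random.rand(4, n_vocab).astype(np.single)

new_tokens = [10, 11, 99, 100]

# Count how many cached tokens match the new prompt. tokens[:-1] is used so the
# last prompt token is always re-evaluated and fresh logits exist for sampling.
longest_prefix = 0
for a, b in zip(cached_ids, new_tokens[:-1]):
    if a == b:
        longest_prefix += 1
    else:
        break

if longest_prefix > 0:
    remaining = new_tokens[longest_prefix:]           # only these need llama_eval
    cached_ids = cached_ids[:longest_prefix]
    cached_scores = cached_scores[:longest_prefix, :]
    print(f"prefix-match hit: reused {longest_prefix} tokens, evaluating {remaining}")
```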
@@ -583,7 +606,7 @@ def generate(
                 logits_processor=logits_processor,
             )
             if stopping_criteria is not None and stopping_criteria(
-                list(self.eval_tokens), self.eval_logits[-1]
+                self._input_ids.tolist(), self._scores[-1, :].tolist()
             ):
                 return
             tokens_or_none = yield token
@@ -718,10 +741,10 @@ def _create_completion(
             try:
                 cache_item = self.cache[prompt_tokens]
                 cache_prefix_len = Llama.longest_token_prefix(
-                    cache_item.eval_tokens, prompt_tokens
+                    cache_item.input_ids.tolist(), prompt_tokens
                 )
                 eval_prefix_len = Llama.longest_token_prefix(
-                    self.eval_tokens, prompt_tokens
+                    self._input_ids.tolist(), prompt_tokens
                 )
                 if cache_prefix_len > eval_prefix_len:
                     self.load_state(cache_item)
@@ -810,7 +833,7 @@ def _create_completion(
                         self.detokenize(completion_tokens[:returned_tokens])
                     )
                     token_offset = len(prompt_tokens) + returned_tokens
-                    logits = self.eval_logits[token_offset - 1]
+                    logits = self._scores[token_offset - 1, :].tolist()
                     current_logprobs = Llama.logits_to_logprobs(logits)
                     sorted_logprobs = list(
                         sorted(
@@ -859,7 +882,7 @@ def _create_completion(
                 break
 
         if stopping_criteria is not None and stopping_criteria(
-            list(self.eval_tokens), self.eval_logits[-1]
+            self._input_ids.tolist(), self._scores[-1, :].tolist()
         ):
             text = self.detokenize(completion_tokens)
             finish_reason = "stop"
@@ -889,7 +912,7 @@ def _create_completion(
                         self.detokenize(completion_tokens[:returned_tokens])
                     )
                     token_offset = len(prompt_tokens) + returned_tokens - 1
-                    logits = self.eval_logits[token_offset]
+                    logits = self._scores[token_offset, :].tolist()
                     current_logprobs = Llama.logits_to_logprobs(logits)
                     sorted_logprobs = list(
                         sorted(
@@ -991,8 +1014,7 @@ def _create_completion(
                 for token in all_tokens
             ]
             all_logprobs = [
-                Llama.logits_to_logprobs(list(map(float, row)))
-                for row in self.eval_logits
+                Llama.logits_to_logprobs(row.tolist()) for row in self._scores
             ][token_offset:]
             for token, token_str, logprobs_token in zip(
                 all_tokens, all_token_strs, all_logprobs
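The logprob fields above are produced by running `Llama.logits_to_logprobs` over rows of `_scores`, i.e. a log-softmax across the vocabulary. A generic, numerically stable version for reference; it may differ in detail from the library's implementation:

```python
import numpy as np

def logits_to_logprobs(row: np.ndarray) -> np.ndarray:
    """Stable log-softmax: log p_i = x_i - max(x) - log(sum(exp(x - max(x))))."""
    shifted = row - row.max()
    return shifted - np.log(np.exp(shifted).sum())

scores = np.random.rand(3, 8).astype(np.single)  # pretend (n_tokens, n_vocab) buffer
all_logprobs = [logits_to_logprobs(row) for row in scores]
assert np.allclose(np.exp(all_logprobs[0]).sum(), 1.0)
```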
@@ -1376,6 +1398,8 @@ def save_state(self) -> LlamaState:
         return LlamaState(
             eval_tokens=self.eval_tokens.copy(),
             eval_logits=self.eval_logits.copy(),
+            scores=self._scores.copy(),
+            input_ids=self._input_ids.copy(),
             llama_state=llama_state_compact,
             llama_state_size=n_bytes,
         )
@@ -1384,6 +1408,8 @@ def load_state(self, state: LlamaState) -> None:
         assert self.ctx is not None
         self.eval_tokens = state.eval_tokens.copy()
         self.eval_logits = state.eval_logits.copy()
+        self._scores = state.scores.copy()
+        self._input_ids = state.input_ids.copy()
         state_size = state.llama_state_size
         if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
             raise RuntimeError("Failed to set llama state data")
