Skip to content

Commit 49c25cc

Browse files
authored
tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies.
* Adapt convert-new.py (and fix a clang-cl compiler error on Windows).
* Improved tokenizer test — but does it work on macOS?
* Improve token type support:
  - Added @klosax code to convert.py
  - Improved token type support in vocabulary
* Exclude platform-dependent tests.
* More sentencepiece compatibility by eliminating magic numbers.
* Restored accidentally removed comment.
* Improve commentary.
* Use token type API in test-tokenizer-1.cpp.
1 parent 0b53b8b commit 49c25cc

File tree

2 files changed

+4
-4
lines changed

2 files changed

+4
-4
lines changed

convert.py

File mode changed from 100755 to 100644
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,8 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
741741
tokens = []
742742
scores = []
743743
toktypes = []
744+
# NOTE: `all_tokens` returns the base vocabulary and added tokens
745+
# TODO: add special tokens?
744746
for text, score, toktype in vocab.all_tokens():
745747
tokens.append(text)
746748
scores.append(score)
@@ -751,8 +753,6 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
751753
self.gguf.add_token_scores(scores)
752754
self.gguf.add_token_types(toktypes)
753755

754-
# TODO: added / special tokens
755-
756756
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
757757
n_elements = 1
758758
for dim in tensor.shape:

tests/test-tokenizer-1.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
8787
return 2;
8888
}
8989
} else {
90-
// TODO: needs access to token types
91-
if (0 <= i && i < 259) {
90+
llama_token_type type = llama_token_get_type(ctx, i);
91+
if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
9292
fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
9393
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
9494
} else {

0 commit comments

Comments
 (0)