Skip to content

Commit 0b53b8b

Browse files
committed
llama : add API for token type
ggml-ci
1 parent 8d177ed commit 0b53b8b

File tree

6 files changed

+115
-116
lines changed

6 files changed

+115
-116
lines changed

convert.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -241,17 +241,19 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
241241
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
242242
else:
243243
added_tokens = {}
244+
244245
vocab_size: int = len(self.bpe_tokenizer)
245-
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
246-
actual_ids = sorted(added_tokens.values())
246+
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
247+
actual_ids = sorted(added_tokens.values())
247248
if expected_ids != actual_ids:
248249
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
250+
249251
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
250-
self.added_tokens_list = [text for (text, idx) in items]
252+
self.added_tokens_list = [text for (text, idx) in items]
251253
self.vocab_size_base: int = vocab_size
252-
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
253-
self.fname_tokenizer = fname_tokenizer
254-
self.fname_added_tokens = fname_added_tokens
254+
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
255+
self.fname_tokenizer = fname_tokenizer
256+
self.fname_added_tokens = fname_added_tokens
255257

256258
def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
257259
tokenizer = self.bpe_tokenizer
@@ -261,12 +263,12 @@ def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
261263
for i, item in enumerate(tokenizer):
262264
text: bytes = item.encode("utf-8")
263265
score: float = -i
264-
yield text, score, 4
266+
yield text, score, gguf.TokenType.USER_DEFINED
265267

266268
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
267269
for text in self.added_tokens_list:
268270
score = -1000.0
269-
yield text.encode("utf-8"), score, 4
271+
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
270272

271273
def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
272274
yield from self.bpe_tokens()
@@ -304,27 +306,27 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
304306
text: bytes = piece.encode("utf-8")
305307
score: float = tokenizer.get_score(i)
306308

307-
toktype = 1 # default to normal token type
309+
toktype = gguf.TokenType.NORMAL
308310
if tokenizer.is_unknown(i):
309-
toktype = 2
311+
toktype = gguf.TokenType.UNKNOWN
310312
if tokenizer.is_control(i):
311-
toktype = 3
313+
toktype = gguf.TokenType.CONTROL
312314

313315
# NOTE: I think added_tokens are user defined.
314316
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
315-
# if tokenizer.is_user_defined(i): toktype = 4
317+
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
316318

317319
if tokenizer.is_unused(i):
318-
toktype = 5
320+
toktype = gguf.TokenType.UNUSED
319321
if tokenizer.is_byte(i):
320-
toktype = 6
322+
toktype = gguf.TokenType.BYTE
321323

322324
yield text, score, toktype
323325

324326
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
325327
for text in self.added_tokens_list:
326328
score = -1000.0
327-
yield text.encode("utf-8"), score, 4
329+
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
328330

329331
def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
330332
yield from self.sentencepiece_tokens()
@@ -725,6 +727,7 @@ def __init__(self, fname_out: Path) -> None:
725727
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
726728

727729
def add_meta_arch(self, params: Params) -> None:
730+
self.gguf.add_name ("llama")
728731
self.gguf.add_context_length (params.n_ctx)
729732
self.gguf.add_embedding_length (params.n_embd)
730733
self.gguf.add_block_count (params.n_layer)

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
139139
struct llama_vocab {
140140
using id = int32_t;
141141
using token = std::string;
142+
using ttype = llama_token_type;
142143

143-
struct token_score {
144-
token tok;
144+
struct token_data {
145+
token text;
145146
float score;
147+
ttype type;
146148
};
147149

148150
std::unordered_map<token, id> token_to_id;
149-
std::vector<token_score> id_to_token;
151+
std::vector<token_data> id_to_token;
150152
};
151153

152154
struct my_llama_hparams {
@@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
516518
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
517519
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
518520

519-
std::vector<const char *> strings;
520-
std::vector<float> scores;
521-
int n_vocab = llama_n_vocab(lctx);
522-
strings.resize(n_vocab, NULL);
523-
scores.resize(n_vocab, 0);
524-
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
525-
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
521+
const int n_vocab = llama_n_vocab(lctx);
526522
vocab->id_to_token.resize(n_vocab);
527523
for (int i=0; i<n_vocab; ++i) {
528-
std::string tok = std::string(strings[i]);
529-
float score = scores[i];
530-
vocab->id_to_token[i].tok = tok;
531-
vocab->id_to_token[i].score = score;
532-
vocab->token_to_id.emplace(tok, i);
524+
vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
525+
vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
526+
vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
527+
vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
533528
}
534529
llama_free(lctx);
535530
llama_free_model(lmodel);
536531
} else { // assume llama2.c vocabulary
537532
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
538533
llama_file file(filename, "rb");
539-
uint32_t n_vocab = config->vocab_size;
534+
const int n_vocab = config->vocab_size;
540535
/* uint32_t max_token_length = */ file.read_u32(); // unused
541536
vocab->id_to_token.resize(n_vocab);
542-
for (uint32_t i=0; i<n_vocab; ++i) {
537+
for (int i=0; i<n_vocab; ++i) {
543538
float_t score = file.read_f32();
544539
uint32_t len = file.read_u32();
545-
std::string tok = file.read_string(len);
546-
vocab->id_to_token[i].tok = tok;
540+
std::string text = file.read_string(len);
541+
vocab->id_to_token[i].text = text;
547542
vocab->id_to_token[i].score = score;
548-
vocab->token_to_id.emplace(tok, i);
543+
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
544+
vocab->token_to_id.emplace(text, i);
549545
}
550546
}
551547
}
@@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
611607
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
612608
// uint32_t n_vocab = model->hparams.n_vocab;
613609
// for (uint32_t i = 0; i < n_vocab; i++) {
614-
// const auto & token_score = vocab->id_to_token.at(i);
615-
// file.write_u32((uint32_t) token_score.tok.size());
616-
// file.write_raw(token_score.tok.data(), token_score.tok.size());
617-
// file.write_raw(&token_score.score, sizeof(token_score.score));
610+
// const auto & token_data = vocab->id_to_token.at(i);
611+
// file.write_u32((uint32_t) token_data.tok.size());
612+
// file.write_raw(token_data.tok.data(), token_data.tok.size());
613+
// file.write_raw(&token_data.score, sizeof(token_data.score));
618614
// }
619615
//
620616
// // stuff AK weights into GG weights one by one.

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
170170
struct llama_vocab {
171171
using id = int32_t;
172172
using token = std::string;
173+
using ttype = llama_token_type;
173174

174-
struct token_score {
175-
token tok;
175+
struct token_data {
176+
token text;
176177
float score;
178+
ttype type;
177179
};
178180

179181
std::unordered_map<token, id> token_to_id;
180-
std::vector<token_score> id_to_token;
182+
std::vector<token_data> id_to_token;
181183
};
182184

183185
struct my_llama_hparams {
@@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
26292631
// // write_vocab
26302632
// uint32_t n_vocab = model->hparams.n_vocab;
26312633
// for (uint32_t i = 0; i < n_vocab; i++) {
2632-
// const auto & token_score = vocab->id_to_token.at(i);
2633-
// file.write_u32((uint32_t) token_score.tok.size());
2634-
// file.write_raw(token_score.tok.data(), token_score.tok.size());
2635-
// file.write_raw(&token_score.score, sizeof(token_score.score));
2634+
// const auto & token_data = vocab->id_to_token.at(i);
2635+
// file.write_u32((uint32_t) token_data.tok.size());
2636+
// file.write_raw(token_data.tok.data(), token_data.tok.size());
2637+
// file.write_raw(&token_data.score, sizeof(token_data.score));
26362638
// }
26372639
// // write tensors
26382640
// write_tensor(&file, model->tok_embeddings);
@@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {
30553057

30563058
struct llama_vocab vocab;
30573059
{
3058-
std::vector<const char *> strings;
3059-
std::vector<float> scores;
3060-
int n_vocab = llama_n_vocab(lctx);
3061-
strings.resize(n_vocab, NULL);
3062-
scores.resize(n_vocab, 0);
3063-
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
3064-
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
3060+
const int n_vocab = llama_n_vocab(lctx);
30653061
vocab.id_to_token.resize(n_vocab);
30663062
for (int i=0; i<n_vocab; ++i) {
3067-
std::string tok = std::string(strings[i]);
3068-
float score = scores[i];
3069-
vocab.id_to_token[i].tok = tok;
3070-
vocab.id_to_token[i].score = score;
3071-
vocab.token_to_id.emplace(tok, i);
3063+
vocab.id_to_token[i].text = llama_token_get_text(lctx, i);
3064+
vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
3065+
vocab.id_to_token[i].type = llama_token_get_type(lctx, i);
3066+
vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
30723067
}
30733068
}
30743069

gguf.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
6262
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
6363

64+
6465
#
6566
# recommended mapping of model tensor names for storage in gguf
6667
#
@@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
319320

320321
return tensor_map
321322

323+
324+
class TokenType(IntEnum):
325+
NORMAL = 1
326+
UNKNOWN = 2
327+
CONTROL = 3
328+
USER_DEFINED = 4
329+
UNUSED = 5
330+
BYTE = 6
331+
322332
#
323333
# implementation
324334
#

0 commit comments

Comments
 (0)