Commit 22b31cd

llama : expose llama_vocab in the API
ggml-ci
1 parent: aefcffa

39 files changed, +496 −415 lines

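For orientation before the per-file diffs: this commit routes tokenizer and special-token queries through a llama_vocab handle obtained from the model, instead of passing the llama_model pointer itself. Below is a minimal sketch of the new call pattern, assuming a model has already been loaded elsewhere; loading, context setup and error handling are omitted and are not part of this diff.

#include "llama.h"
#include "common.h"

#include <cstdio>
#include <vector>

// Sketch only (not part of the commit): the call pattern the diff migrates to.
static void demo_vocab_api(const llama_model * model) {
    // token queries now go through the vocab handle exposed by llama_get_vocab()
    const llama_vocab * vocab = llama_get_vocab(model);

    if (llama_token_bos(vocab) == LLAMA_TOKEN_NULL) {
        // some models (e.g. T5) don't have a BOS token
    }

    // the common_* helpers gain vocab-based overloads alongside the ctx-based ones
    std::vector<llama_token> toks = common_tokenize(vocab, "hello world", /*add_special=*/true);
    for (llama_token t : toks) {
        printf("%d -> '%s'\n", t, common_token_to_piece(vocab, t).c_str());
    }
}
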
common/common.cpp

Lines changed: 37 additions & 20 deletions
@@ -857,21 +857,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_token_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_token_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
@@ -941,14 +943,14 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
         for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+            if (llama_token_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -969,8 +971,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_token_bos(vocab);
+        llama_token eos = llama_token_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1559,21 +1562,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1582,12 +1587,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1597,13 +1608,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }

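The rewritten helpers above keep the existing convention of the underlying C calls: a negative return value is the size the output buffer needs, so the wrapper resizes once and retries. Below is a minimal sketch of calling llama_tokenize directly with that convention and the new vocab parameter; it is a simplified rewrite for illustration, not code from the commit. common_token_to_piece and common_detokenize follow the same negative-length protocol.

#include "llama.h"

#include <string>
#include <vector>

// Sketch: the resize-and-retry convention wrapped by common_tokenize() above.
static std::vector<llama_token> tokenize_raw(const llama_vocab * vocab, const std::string & text) {
    std::vector<llama_token> result(text.length() + 2); // rough upper bound, as in the helper

    int n = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(),
                           /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {
        // a negative count means the buffer was too small; -n is the required size
        result.resize(-n);
        n = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), true, false);
    }
    result.resize(n);
    return result;
}
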
common/common.h

Lines changed: 12 additions & 2 deletions
@@ -541,7 +541,7 @@ std::vector<llama_token> common_tokenize(
         bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special = false);
@@ -553,11 +553,21 @@ std::string common_token_to_piece(
         llama_token token,
         bool special = true);
 
+std::string common_token_to_piece(
+        const struct llama_vocab * vocab,
+        llama_token token,
+        bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-        llama_context * ctx,
+        const struct llama_context * ctx,
+        const std::vector<llama_token> & tokens,
+        bool special = true);
+
+std::string common_detokenize(
+        const struct llama_vocab * vocab,
         const std::vector<llama_token> & tokens,
         bool special = true);
 

common/sampling.cpp

Lines changed: 5 additions & 3 deletions
@@ -142,13 +142,15 @@ std::string common_params_sampling::print() const {
 }
 
 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_get_vocab(model);
+
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
 
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
         /* .chain = */ llama_sampler_chain_init(lparams),
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
@@ -172,7 +174,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     c_breakers.push_back(str.c_str());
                 }
 
-                llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
             }
                 break;
             case COMMON_SAMPLER_TYPE_TOP_K:
@@ -194,7 +196,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
             case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
                 break;
             case COMMON_SAMPLER_TYPE_PENALTIES:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));

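The grammar, DRY and infill samplers are now constructed from the vocab; DRY additionally takes the model's training context length as a separate argument. A rough sketch of building a small chain by hand with the new signatures follows; the parameter values and the decision to put all three samplers in one chain are illustrative assumptions, not taken from the commit.

#include "llama.h"

// Sketch: vocab-aware sampler construction, mirroring common_sampler_init() above.
static llama_sampler * make_chain(const llama_model * model, const char * grammar_str) {
    const llama_vocab * vocab = llama_get_vocab(model);

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // grammar and infill samplers take the vocab directly
    llama_sampler_chain_add(chain, llama_sampler_init_grammar(vocab, grammar_str, "root"));
    llama_sampler_chain_add(chain, llama_sampler_init_infill(vocab));

    // DRY takes the vocab plus the training context length of the model
    const char * breakers[] = { "\n" };
    llama_sampler_chain_add(chain, llama_sampler_init_dry(vocab, llama_n_ctx_train(model),
                                                          /*multiplier=*/0.8f, /*base=*/1.75f,
                                                          /*allowed_length=*/2, /*penalty_last_n=*/-1,
                                                          breakers, 1));
    return chain;
}
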
common/speculative.cpp

Lines changed: 15 additions & 12 deletions
@@ -79,10 +79,13 @@ bool common_speculative_are_compatible(
     const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
     const struct llama_model * model_dft = llama_get_model(ctx_dft);
 
-    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    const struct llama_vocab * vocab_tgt = llama_get_vocab(model_tgt);
+    const struct llama_vocab * vocab_dft = llama_get_vocab(model_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
     LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
-    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
@@ -91,13 +94,13 @@ bool common_speculative_are_compatible(
         return false;
     }
 
-    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
-        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
-        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
-        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
-        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
+    if (llama_add_bos_token(vocab_tgt) != llama_add_bos_token(vocab_dft) ||
+        llama_add_eos_token(vocab_tgt) != llama_add_eos_token(vocab_dft) ||
+        llama_token_bos(vocab_tgt) != llama_token_bos(vocab_dft) ||
+        llama_token_eos(vocab_tgt) != llama_token_eos(vocab_dft)) {
+        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(vocab_tgt), llama_add_bos_token(vocab_tgt), llama_token_eos(vocab_tgt), llama_add_eos_token(vocab_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(vocab_dft), llama_add_bos_token(vocab_dft), llama_token_eos(vocab_dft), llama_add_eos_token(vocab_dft));
         return false;
     }
 
@@ -115,10 +118,10 @@ bool common_speculative_are_compatible(
     }
 
     for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
-        const char * token_text_tgt = llama_token_get_text(model_tgt, i);
-        const char * token_text_dft = llama_token_get_text(model_dft, i);
+        const char * token_text_tgt = llama_token_get_text(vocab_tgt, i);
+        const char * token_text_dft = llama_token_get_text(vocab_dft, i);
         if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-            LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+            LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
                     "token %d content differs - target '%s', draft '%s'\n", __func__, i,
                     common_token_to_piece(ctx_tgt, i).c_str(),
                     common_token_to_piece(ctx_dft, i).c_str());

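Compatibility between a draft and a target model is now decided by comparing their vocab objects. Below is a condensed sketch of the same checks as a standalone helper; the real function above additionally compares the add_bos/add_eos flags, tolerates a small vocab-size difference and starts the text comparison at SPEC_VOCAB_CHECK_START_TOKEN_ID rather than 0.

#include "llama.h"

#include <algorithm>
#include <cstring>

// Sketch: vocab-level compatibility test, condensed from common_speculative_are_compatible().
static bool vocabs_compatible(const llama_model * model_tgt, const llama_model * model_dft) {
    const llama_vocab * vocab_tgt = llama_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_get_vocab(model_dft);

    if (llama_vocab_type(vocab_tgt) != llama_vocab_type(vocab_dft)) {
        return false; // different tokenizer families
    }

    if (llama_token_bos(vocab_tgt) != llama_token_bos(vocab_dft) ||
        llama_token_eos(vocab_tgt) != llama_token_eos(vocab_dft)) {
        return false; // special tokens must agree
    }

    // token texts must match as well
    const int n = std::min(llama_n_vocab(model_tgt), llama_n_vocab(model_dft));
    for (int i = 0; i < n; ++i) {
        if (std::strcmp(llama_token_get_text(vocab_tgt, i), llama_token_get_text(vocab_dft, i)) != 0) {
            return false;
        }
    }
    return true;
}
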
examples/batched/batched.cpp

Lines changed: 5 additions & 3 deletions
@@ -48,10 +48,12 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    const llama_vocab * vocab = llama_get_vocab(model);
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = common_tokenize(model, params.prompt, true);
+    tokens_list = common_tokenize(vocab, params.prompt, true);
 
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
@@ -121,7 +123,7 @@ int main(int argc, char ** argv) {
 
         llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
         if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
-            decoder_start_token_id = llama_token_bos(model);
+            decoder_start_token_id = llama_token_bos(vocab);
         }
 
         common_batch_clear(batch);
@@ -174,7 +176,7 @@ int main(int argc, char ** argv) {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
             // is it an end of generation? -> mark the stream as finished
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
+            if (llama_token_is_eog(vocab, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
                 LOG("\n");
                 if (n_parallel > 1) {

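End-of-generation detection in the example now also goes through the vocab. Below is a trimmed sketch of that stop condition in a plain sampling loop; context, batch and decode handling are omitted, and sampling from index -1 (the last evaluated token) is an assumption of this sketch rather than the example's i_batch bookkeeping.

#include "llama.h"
#include "common.h"

#include <cstdio>

// Sketch: stop condition of a generation loop, following batched.cpp above.
static void sample_until_eog(llama_context * ctx, llama_sampler * smpl, int n_predict) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_get_vocab(model);

    for (int n_cur = 0; n_cur < n_predict; ++n_cur) {
        const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);

        // is it an end of generation? -> stop the stream
        if (llama_token_is_eog(vocab, new_token_id)) {
            break;
        }

        printf("%s", common_token_to_piece(ctx, new_token_id).c_str());
        // a real loop would append new_token_id to a batch and call llama_decode() here
    }
}
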
examples/cvector-generator/cvector-generator.cpp

Lines changed: 3 additions & 1 deletion
@@ -273,7 +273,9 @@ struct tokenized_prompt {
     size_t max_seq_len;
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_get_vocab(model);
+        const bool add_bos = llama_add_bos_token(vocab);
         tokens_pos = common_tokenize(ctx, pos, add_bos, true);
         tokens_neg = common_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());

examples/embedding/embedding.cpp

Lines changed: 3 additions & 1 deletion
@@ -105,6 +105,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    const llama_vocab * vocab = llama_get_vocab(model);
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -148,7 +150,7 @@ int main(int argc, char ** argv) {
     // check if the last token is SEP
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+        if (inp.empty() || inp.back() != llama_token_sep(vocab)) {
            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }

examples/eval-callback/eval-callback.cpp

Lines changed: 4 additions & 1 deletion
@@ -127,7 +127,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool run(llama_context * ctx, const common_params & params) {
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_get_vocab(model);
+
+    const bool add_bos = llama_add_bos_token(vocab);
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 

examples/gritlm/gritlm.cpp

Lines changed: 8 additions & 5 deletions
@@ -11,6 +11,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     std::vector<std::vector<float>> result;
 
     const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_get_vocab(model);
 
     llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
@@ -19,16 +20,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         const std::string input_string = instruction + sentences[i];
 
-        std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
+        std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
 
         const int32_t n_toks = inputs.size();
 
         // GritLM seems to have EOS = ""
        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(model));
+        // inputs.push_back(llama_token_eos(vocab));
 
         // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
+        const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
 
 #ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample
@@ -97,15 +98,17 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
     std::string result;
 
     const llama_model * model = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(model);
+    const llama_vocab * vocab = llama_get_vocab(model);
+
+    llama_token eos_token = llama_token_eos(vocab);
 
     llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-    std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
+    std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
     int32_t i_current_token = 0;
 
     while (true) {

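gritlm.cpp drives the same context in two modes, embedding for encode() and causal generation for generate(); with this commit the vocab handle is fetched once in each helper. Below is a small sketch of the mode switch performed before generation, assembled from the context lines above (batch handling omitted).

#include "llama.h"

// Sketch: switching a context from embedding mode to causal generation,
// as generate() in the GritLM example does before decoding.
static void switch_to_generation(llama_context * ctx) {
    llama_kv_cache_clear(ctx);        // drop any cached state from the embedding pass
    llama_set_embeddings(ctx, false); // stop producing embeddings
    llama_set_causal_attn(ctx, true); // causal attention for autoregressive decoding
}
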