@@ -975,20 +975,6 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
 //
 // globals
 //
@@ -1202,10 +1188,10 @@ struct llama_vocab {
     id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "\u0120");
-        replace_all(token_left,  "\n", "\u010A");
-        replace_all(token_right, " ",  "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
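
find_bpe_rank no longer converts spaces and newlines itself; it now asserts that both tokens already arrive in byte-level BPE form, where GPT-2-style tokenizers encode a space as U+0120 ('Ġ') and a newline as U+010A ('Ċ'). A minimal caller-side sketch of that pre-mapping, reusing the same replace_all string helper the removed lines called (the helper name below is hypothetical, not introduced by this commit):

    // Hypothetical helper: map raw bytes into the byte-level BPE alphabet
    // before any find_bpe_rank lookup (an illustration, not code from this diff).
    static std::string to_bpe_form(std::string s) {
        replace_all(s, " ",  "\u0120"); // space   -> U+0120 'Ġ'
        replace_all(s, "\n", "\u010A"); // newline -> U+010A 'Ċ'
        return s;
    }

    // usage: int rank = vocab.find_bpe_rank(to_bpe_form(left), to_bpe_form(right));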
@@ -7461,6 +7447,21 @@ void llama_sample_repetition_penalties(
     }
 }
 
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    }
+    else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
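
The static llama_token_to_piece wrapper re-added here encodes the sizing protocol of the public llama_token_to_piece C API: when the destination buffer is too small, the call returns the negative of the required length, and a retry with a buffer of exactly that size must succeed. A sketch of the same two-call pattern at a call site, assuming only the contract visible in this diff (the function name below is hypothetical):

    // Two-call pattern against the public API; error handling beyond the
    // resize-and-retry is omitted.
    std::string piece_of(const struct llama_model * model, llama_token token) {
        std::vector<char> buf(8, 0); // small initial guess
        int n = llama_token_to_piece(model, token, buf.data(), buf.size());
        if (n < 0) {                 // buffer too small: -n is the needed size
            buf.resize(-n);
            n = llama_token_to_piece(model, token, buf.data(), buf.size());
            GGML_ASSERT(n == (int) buf.size()); // retry must fit exactly
        }
        return std::string(buf.data(), n);
    }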
@@ -7480,7 +7481,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7692,7 +7693,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
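
The decode_utf8 call above takes the piece's raw UTF-8 bytes together with carry-over state (grammar->partial_utf8), presumably because a token's piece can end in the middle of a multi-byte sequence. A self-contained illustration of why such stateful decoding is needed; this is not llama.cpp's decode_utf8, whose body is outside this diff:

    #include <cstdint>
    #include <string>
    #include <vector>

    // Carry-over state for a UTF-8 sequence split across two pieces.
    struct utf8_state {
        uint32_t value    = 0; // bits accumulated so far
        int      n_remain = 0; // continuation bytes still expected
    };

    // Decode one piece, emitting complete code points and leaving any
    // unfinished sequence in 'st' for the next piece. Invalid-byte
    // handling is omitted for brevity.
    static std::vector<uint32_t> decode_piece(const std::string & piece, utf8_state & st) {
        std::vector<uint32_t> cps;
        for (unsigned char c : piece) {
            if (st.n_remain > 0) {              // continuation byte 10xxxxxx
                st.value = (st.value << 6) | (c & 0x3F);
                if (--st.n_remain == 0) cps.push_back(st.value);
            } else if (c < 0x80) {              // 1-byte ASCII
                cps.push_back(c);
            } else if ((c & 0xE0) == 0xC0) {    // 2-byte lead 110xxxxx
                st.value = c & 0x1F; st.n_remain = 1;
            } else if ((c & 0xF0) == 0xE0) {    // 3-byte lead 1110xxxx
                st.value = c & 0x0F; st.n_remain = 2;
            } else {                            // 4-byte lead 11110xxx
                st.value = c & 0x07; st.n_remain = 3;
            }
        }
        return cps;
    }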