
Commit 5c8155a

mglambda authored and ggerganov committed
llama : use LLAMA_TOKEN_NULL (ggml-org#11062)
ggml-ci
1 parent b38c89d commit 5c8155a

11 files changed (+55, -56 lines)

common/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -982,7 +982,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (llama_model_has_encoder(model)) {
         llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
         llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
             decoder_start_token_id = bos;
         }
         tmp.clear();
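
The hunk above is the canonical shape of this commit: probe an optional token through the API and fall back when the null sentinel comes back. The same decoder-start/BOS fallback recurs in examples/batched and examples/main below. A minimal sketch of the pattern, using only API calls visible in this diff (the helper name is hypothetical, not from the codebase):

```cpp
// Hypothetical helper, not from the diff: resolve the decoder start token,
// falling back to BOS when the model does not define one. Both calls are
// part of the llama.cpp API as used in the hunks of this commit.
static llama_token decoder_start_or_bos(const llama_model * model) {
    const llama_token id = llama_model_decoder_start_token(model);
    return id == LLAMA_TOKEN_NULL ? llama_token_bos(model) : id;
}
```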

common/ngram-cache.cpp

Lines changed: 12 additions & 12 deletions
@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
     common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     const common_ngram_cache_part part_static = part_static_it->second;

     int max_count_static = 0;
     int sum_count_static = 0;
-    llama_token max_token = -1;
+    llama_token max_token = LLAMA_TOKEN_NULL;

     for (std::pair<llama_token, int> token_count_static : part_static) {
         const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
     }

     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     return max_token;
 }
@@ -98,9 +98,9 @@ static llama_token try_draft(
     common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {

-    llama_token drafted_token = -1;
+    llama_token drafted_token = LLAMA_TOKEN_NULL;

-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
         const common_ngram ngram_primary = ngrams_primary[i];

         common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
         int max_count_primary = 0;
         int max_count_static  = 0;
         int sum_count_primary = 0;
-        llama_token max_token = -1;
+        llama_token max_token = LLAMA_TOKEN_NULL;

         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
     }

     while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
+        llama_token drafted_token = LLAMA_TOKEN_NULL;

         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
         common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
             }
             ngrams_cd.push_back(ngram_cd);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_static, ngram_static);
         }

-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             break;
         }
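
Taken together, the hunks in common_ngram_cache_draft form a three-tier cascade: each n-gram cache is consulted only while the draft is still the null sentinel, and a sentinel surviving all three tiers stops drafting. A self-contained sketch of that control flow, with hypothetical stand-ins for the try_draft calls:

```cpp
#include <cstdint>

typedef int32_t llama_token;
#define LLAMA_TOKEN_NULL -1

// Hypothetical stand-ins for the three try_draft(...) calls above; the real
// code consults the context cache (lax thresholds), the dynamic cache
// (strict thresholds), and finally the static cache.
static llama_token try_context() { return LLAMA_TOKEN_NULL; }
static llama_token try_dynamic() { return LLAMA_TOKEN_NULL; }
static llama_token try_static()  { return 42; } // dummy token id

static llama_token draft_one() {
    llama_token t = LLAMA_TOKEN_NULL;
    if (t == LLAMA_TOKEN_NULL) t = try_context();
    if (t == LLAMA_TOKEN_NULL) t = try_dynamic();
    if (t == LLAMA_TOKEN_NULL) t = try_static();
    return t; // still LLAMA_TOKEN_NULL => the caller breaks out of the loop
}

int main() { return draft_one() == LLAMA_TOKEN_NULL ? 1 : 0; }
```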

common/ngram-cache.h

Lines changed: 2 additions & 2 deletions
@@ -17,13 +17,13 @@ struct common_ngram {

     common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
+            tokens[i] = LLAMA_TOKEN_NULL;
         }
     }

     common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
         }
     }

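A note on why the constructors pad with the sentinel: common_ngram stores every n-gram in a fixed LLAMA_NGRAM_MAX-sized array (4, judging by the four-element threshold arrays in common/ngram-cache.cpp above), so the unused tail slots need a value that can never collide with a real token id; comparison and hashing then work uniformly for shorter n-grams. A sketch under that reading:

```cpp
// Sketch, assuming LLAMA_NGRAM_MAX == 4 as suggested by the {75, 66, 66, 66}
// arrays earlier in this diff: a 2-gram leaves slots 2..3 holding the null
// sentinel rather than uninitialized or accidental token ids.
llama_token input[2] = {10, 20};
common_ngram ng(input, 2); // tokens == {10, 20, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL}
```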

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ int main(int argc, char ** argv) {
     }

     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id == -1) {
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
         decoder_start_token_id = llama_token_bos(model);
     }


examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 2 additions & 2 deletions
@@ -689,8 +689,8 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

     gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
     gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions
@@ -494,7 +494,7 @@ int main(int argc, char ** argv) {
     }

     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id == -1) {
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
         decoder_start_token_id = llama_token_bos(model);
     }

@@ -831,7 +831,7 @@ int main(int argc, char ** argv) {
             // if user stop generation mid-way, we must add EOT to finish model's last response
             if (need_insert_eot && format_chat) {
                 llama_token eot = llama_token_eot(model);
-                embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+                embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
                 need_insert_eot = false;
             }
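
Both hunks above share the "preferred token, else fallback" shape, once as an if statement and once as a ternary. A hedged one-liner capturing it (the helper name token_or is hypothetical, not from the codebase):

```cpp
// Hypothetical helper: prefer `tok`, fall back when it is the null sentinel.
// The second hunk above would then read
//   embd_inp.push_back(token_or(llama_token_eot(model), llama_token_eos(model)));
static llama_token token_or(llama_token tok, llama_token fallback) {
    return tok == LLAMA_TOKEN_NULL ? fallback : tok;
}
```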

examples/server/utils.hpp

Lines changed: 1 addition & 1 deletion
@@ -507,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);

     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
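
The trailing comment spells out the heuristic: in UTF-8, any byte with the top bit set belongs to a multibyte sequence, so a one-byte piece with that bit set must be a fragment. A minimal sketch of the test (helper name assumed, not from the file):

```cpp
#include <string>

// Sketch of the check described by the comment above: a lone byte with the
// high bit (0x80) set cannot be a complete UTF-8 character on its own.
static bool is_partial_utf8(const std::string & out) {
    return out.size() == 1 && (out[0] & 0x80) != 0;
}
```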

include/llama.h

Lines changed: 0 additions & 1 deletion
@@ -34,7 +34,6 @@

 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1

 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
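
Because LLAMA_TOKEN_NULL expands to -1, every replacement in this commit is behavior-preserving; the removed TODO shows the macro was introduced for exactly this sweep. A self-contained check of the equivalence:

```cpp
// Self-contained check that the rename is a runtime no-op: the sentinel is
// still the integer -1, per the #define in include/llama.h. The typedef
// mirrors the llama_token typedef in llama.h.
#include <cassert>
#include <cstdint>

typedef int32_t llama_token;
#define LLAMA_TOKEN_NULL -1

int main() {
    llama_token t = LLAMA_TOKEN_NULL;
    assert(t == -1);               // old spelling
    assert(t == LLAMA_TOKEN_NULL); // new spelling, same value
    return 0;
}
```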

src/llama-model.cpp

Lines changed: 18 additions & 18 deletions
@@ -1923,24 +1923,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
-    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
-    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-
-    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
-
-    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
-    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
-    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
-    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
-    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
-    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
+    if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+    if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }

     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );

src/llama-sampling.cpp

Lines changed: 4 additions & 4 deletions
@@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         for (int i = 0; i < (int)cur_p->size; ++i) {
             const float val = cur_p->data[i].logit;
             int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
-            ib = std::max(0, std::min(nbuckets-1, ib));
+            ib = std::max(0, std::min(nbuckets - 1, ib));
             bucket_idx[i] = ib;
             ++histo[ib];
         }
@@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         for (int i = 0; i < (int)cur_p->size; ++i) {
             int j = bucket_idx[i];
             if (j >= ib) {
-                *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
+                *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
             }
         }

         ptr = tmp_tokens.data();
         int ndone = 0;
-        for (int j = nbuckets-1; j > ib; --j) {
+        for (int j = nbuckets - 1; j > ib; --j) {
             std::sort(ptr, ptr + histo[j], comp);
             ptr += histo[j];
             ndone += histo[j];
@@ -1832,7 +1832,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
                 ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
                 if (n > 0) {
                     lt = k;
-                    rt = k+n-1;
+                    rt = k + n - 1;
                 }
             } else {
                 // If k is inside the current Z-box, consider two cases.
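
These hunks only adjust spacing, but the inline comment in the first one gives the intent of bucket_scale and bucket_inter: an affine precomputation of nbuckets * (val - bucket_low) / (bucket_high - bucket_low), clamped into range. A sketch of that map (the derivation is assumed from the comment, not copied from the file):

```cpp
#include <algorithm>

// Assumed from the inline comment above: fold the bucket formula into one
// multiply-add per logit, then clamp the index into [0, nbuckets - 1].
static int bucket_of(float val, float bucket_low, float bucket_high, int nbuckets) {
    const float bucket_scale = nbuckets / (bucket_high - bucket_low);
    const float bucket_inter = -bucket_low * bucket_scale;
    int ib = int(bucket_scale * val + bucket_inter); // == nbuckets*(val-low)/(high-low)
    return std::max(0, std::min(nbuckets - 1, ib));
}
```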

src/llama-vocab.cpp

Lines changed: 12 additions & 12 deletions
@@ -497,7 +497,7 @@ struct llm_tokenizer_bpe_session {

     bool append_bos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_bos) {
-            GGML_ASSERT(vocab.special_bos_id != -1);
+            GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_bos_id);
             return true;
         }
@@ -506,7 +506,7 @@ struct llm_tokenizer_bpe_session {

     bool append_eos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_eos) {
-            GGML_ASSERT(vocab.special_eos_id != -1);
+            GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_eos_id);
             return true;
         }
@@ -1403,7 +1403,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                 if (source == 0) {
                     buffer.erase_after(buffer.before_begin());
                 } else {
-                    buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                    buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                 }

                 // repeat for the right side
@@ -1417,7 +1417,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                 if (source == 0) {
                     buffer.erase_after(buffer.before_begin());
                 } else {
-                    buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                    buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                 }
                 break;
             }
@@ -1454,7 +1454,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 bool is_prev_special = true; // prefix with space if first token

                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
                 }
@@ -1489,7 +1489,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }

                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
@@ -1522,7 +1522,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
         case LLAMA_VOCAB_TYPE_WPM:
             {
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_cls_id);
                 }

@@ -1542,14 +1542,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }

                 if (add_special) {
-                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_sep_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                 }
                 llm_tokenizer_ugm_session session(vocab);
@@ -1574,7 +1574,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }

                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
@@ -1642,7 +1642,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }

 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && vocab.special_eog_ids.count(token) > 0;
+    return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
 }

 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
@@ -1881,7 +1881,7 @@ int32_t llama_detokenize_impl(
     }

     if (remove_special && vocab.tokenizer_add_eos) {
-        if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
+        if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
             n_tokens--;
         }
     }
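
One detail worth keeping from llama_token_is_eog_impl: the sentinel is rejected before the set lookup, so callers can pass "no token" straight through without a special case. A self-contained sketch of the same guard:

```cpp
#include <cstdint>
#include <set>

typedef int32_t llama_token;
#define LLAMA_TOKEN_NULL -1

// Same guard as llama_token_is_eog_impl above: bail out on the null sentinel
// before consulting the end-of-generation id set.
static bool is_eog(const std::set<llama_token> & eog_ids, llama_token token) {
    return token != LLAMA_TOKEN_NULL && eog_ids.count(token) > 0;
}

int main() {
    const std::set<llama_token> eog = {2};
    return (!is_eog(eog, LLAMA_TOKEN_NULL) && is_eog(eog, 2)) ? 0 : 1;
}
```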
