
Commit 9854a9c

Author: jaime-m-p

Symmetric params for llama_tokenize() and llama_detokenize()

1 parent: 4a28063

File tree: 4 files changed, +29 −12 lines


common/common.cpp

Lines changed: 2 additions & 2 deletions
@@ -2924,10 +2924,10 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
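The wrapper keeps llama.cpp's negative-return sizing convention, now passing false for the new remove_special argument so the old behavior is preserved: if the output buffer is too small, the call returns the required size negated, the string is regrown once, and the call is retried. A minimal standalone sketch of that convention, with a hypothetical detok callable standing in for the bound llama_detokenize() call:

#include <cstdint>
#include <functional>
#include <string>

// Grow-once retry as in the wrapper above; a negative return value
// encodes the required buffer size. `detok` is a hypothetical stand-in
// for a partially-applied llama_detokenize() call.
static std::string detokenize_to_string(const std::function<int32_t(char *, int32_t)> & detok) {
    std::string text(64, '\0');                          // initial size guess
    int32_t n_chars = detok(&text[0], (int32_t)text.size());
    if (n_chars < 0) {
        text.resize(-n_chars);                           // grow to the reported requirement
        n_chars = detok(&text[0], (int32_t)text.size()); // second call should now fit
    }
    text.resize(n_chars);                                // trim to the bytes actually written
    return text;
}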

llama.cpp

Lines changed: 17 additions & 3 deletions
@@ -18503,16 +18503,30 @@ int32_t llama_detokenize(
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
-                            bool   special) {
+                            bool   remove_special,
+                            bool   unparse_special) {
     // remove the leading space of the first non-control token
     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
-    bool remove_space = !special && model->vocab.tokenizer_add_space_prefix;
+    bool remove_space = !unparse_special && model->vocab.tokenizer_add_space_prefix;
     int32_t avail = text_len_max;
     int32_t total = 0;

+    if (remove_special && model->vocab.tokenizer_add_bos) {
+        if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
+            n_tokens--;
+            tokens++;
+        }
+    }
+
+    if (remove_special && model->vocab.tokenizer_add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens-1] == model->vocab.special_eos_id) {
+            n_tokens--;
+        }
+    }
+
     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, special);
+        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
         const llama_token_attr attr = llama_token_get_attr(model, tokens[i]);
         remove_space = remove_space && (attr & attr_special); // until non-control token
         if (n_chars < 0) {
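The new remove_special branches strip at most one leading BOS token (by advancing the tokens pointer) and one trailing EOS token (by shortening n_tokens), and only when the vocab is configured to add them at tokenization time. The same trimming can be sketched on a std::vector, with hypothetical bos_id/eos_id parameters standing in for the model->vocab fields:

#include <cstdint>
#include <vector>

// Standalone illustration of the remove_special trimming above; the ids and
// flags are hypothetical stand-ins for model->vocab.special_bos_id/eos_id
// and tokenizer_add_bos/eos.
static void strip_bos_eos(std::vector<int32_t> & tokens,
                          int32_t bos_id, bool add_bos,
                          int32_t eos_id, bool add_eos) {
    if (add_bos && !tokens.empty() && tokens.front() == bos_id) {
        tokens.erase(tokens.begin()); // mirrors tokens++ / n_tokens-- on the raw pointer
    }
    if (add_eos && !tokens.empty() && tokens.back() == eos_id) {
        tokens.pop_back();            // mirrors n_tokens--
    }
}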

llama.h

Lines changed: 6 additions & 3 deletions
@@ -874,6 +874,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(

@@ -898,18 +899,20 @@ extern "C" {
                          int32_t   lstrip,
                             bool   special);

-    /// @details Convert the provided tokens into text.
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
     /// @param text The char pointer must be large enough to hold the resulting text.
     /// @return Returns the number of chars/bytes on success, no more than text_len_max.
     /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
-    /// @param special If true, special tokens are rendered in the output.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_detokenize(
         const struct llama_model * model,
                const llama_token * tokens,
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
-                            bool   special);
+                            bool   remove_special,
+                            bool   unparse_special);

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
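With both declarations in view, the symmetry named in the commit title is explicit: add_special/parse_special on the tokenize side mirror remove_special/unparse_special on the detokenize side. A round-trip sketch using the updated signatures, assuming model is an already-loaded llama_model pointer; the buffer sizes are guesses and a negative return from llama_tokenize() is not handled here:

#include <string>
#include <vector>
#include "llama.h"

// Round-trip sketch: tokenization adds BOS/EOS (if the model is configured
// to do so), and detokenization removes them again via remove_special=true.
std::string roundtrip(const llama_model * model, const std::string & prompt) {
    std::vector<llama_token> tokens(prompt.size() + 16); // rough upper bound
    int32_t n_tokens = llama_tokenize(model, prompt.c_str(), (int32_t)prompt.size(),
                                      tokens.data(), (int32_t)tokens.size(),
                                      /*add_special=*/true, /*parse_special=*/true);
    tokens.resize(n_tokens);

    std::string text(prompt.size() + 16, '\0');
    int32_t n_chars = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(),
                                       &text[0], (int32_t)text.size(),
                                       /*remove_special=*/true, /*unparse_special=*/false);
    if (n_chars < 0) { // buffer too small: grow to the reported size and retry
        text.resize(-n_chars);
        n_chars = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(),
                                   &text[0], (int32_t)text.size(),
                                   /*remove_special=*/true, /*unparse_special=*/false);
    }
    text.resize(n_chars);
    return text;
}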

tests/test-tokenizer-random.py

Lines changed: 4 additions & 4 deletions
@@ -98,15 +98,15 @@ def tokenize(self, text: str, add_special: bool = False, parse_special: bool = F
         num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
         return list(self.token_ids[0:num])

-    def detokenize(self, ids: list[int], special: bool = False) -> str:
+    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
         if len(self.token_ids) < len(ids):
             self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
         for i, id in enumerate(ids):
             self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         while num < 0 and len(self.text_buff) < (16 << 20):
             self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'


@@ -160,7 +160,7 @@ def encode(self, text: str) -> list[int]:
         return self.model.tokenize(text, add_special=True, parse_special=True)

     def decode(self, ids: list[int]) -> str:
-        return self.model.detokenize(ids, special=True)
+        return self.model.detokenize(ids, remove_special=False, unparse_special=True)


 def generator_custom_text() -> Iterator[str]:
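The test binding grows its buffer differently from the common/common.cpp wrapper: on failure it reallocates to twice the reported requirement and loops, capped at 16 MiB (16 << 20). The same policy expressed in C++ (a sketch; detok is again a hypothetical stand-in for the bound call):

#include <cstdint>
#include <functional>
#include <vector>

// Capped doubling retry mirroring the Python harness: on a negative return,
// regrow to twice the reported requirement, up to a 16 MiB ceiling.
static int32_t detokenize_with_retry(std::vector<char> & buf,
                                     const std::function<int32_t(char *, int32_t)> & detok) {
    int32_t num = detok(buf.data(), (int32_t)buf.size());
    while (num < 0 && buf.size() < (16u << 20)) {
        buf.resize(2 * (size_t)(-num)); // -num is the required size; allocate 2x
        num = detok(buf.data(), (int32_t)buf.size());
    }
    return num;
}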
