Skip to content

Commit 05224ed

Browse files
committed
Add llama_tokens_to_string() to utils.cpp
- Also add a single-token converter (single_llama_token_to_string)
1 parent 912e624 commit 05224ed

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

utils.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,16 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
347347

348348
return res;
349349
}
350+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
351+
std::string res;
352+
for (auto t : tokens) {
353+
res += vocab.id_to_token.at(t);
354+
}
355+
return res;
356+
}
357+
// Look up the string piece for a single token id.
// Throws std::out_of_range (from map::at) if the id is not in the vocab.
// NOTE(review): the declaration in utils.h marks this `inline` while the
// definition lives here — confirm other TUs can link against it.
std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens) {
    const auto & piece = vocab.id_to_token.at(tokens);
    return piece;
}
350360

351361
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
352362
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

utils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
8181
// ref: https://github.com/google/sentencepiece
8282
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
8383

84+
// convert tokens to string
85+
// opposite llama_tokenize
86+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
87+
inline std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens);
88+
8489
// load the tokens from encoder.json
8590
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
8691

0 commit comments

Comments
 (0)