Skip to content

Commit 05224ed

Browse files
committed
Add llama_tokens_to_string() to utils.cpp
- Also add a single-token converter (single_llama_token_to_string)
1 parent 912e624 commit 05224ed

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

utils.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,16 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
347347

348348
return res;
349349
}
350+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
351+
std::string res;
352+
for (auto t : tokens) {
353+
res += vocab.id_to_token.at(t);
354+
}
355+
return res;
356+
}
357+
// Look up the string piece for a single token id.
// Throws std::out_of_range (from map::at) if the id is not in the vocab.
// NOTE(review): the declaration in utils.h marks this `inline` while the
// definition lives here — confirm other TUs can link against it.
std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens) {
    const auto & piece = vocab.id_to_token.at(tokens);
    return piece;
}
350360

351361
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
352362
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

utils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
8181
// ref: https://github.com/google/sentencepiece
8282
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
8383

84+
// convert tokens to string
85+
// opposite llama_tokenize
86+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
87+
inline std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens);
88+
8489
// load the tokens from encoder.json
8590
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
8691

0 commit comments

Comments
 (0)