Skip to content

Commit 36639c3

Browse files
committed
Add llama_tokens_to_string() to utils.cpp
- Also add a single-token-to-string converter
1 parent 407c1b8 commit 36639c3

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

utils.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,16 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
340340

341341
return res;
342342
}
343+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
344+
std::string res;
345+
for (auto t : tokens) {
346+
res += vocab.id_to_token.at(t);
347+
}
348+
return res;
349+
}
350+
std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens) {
351+
return vocab.id_to_token.at(tokens);
352+
}
343353

344354
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
345355
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

utils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
7676
// ref: https://github.com/google/sentencepiece
7777
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
7878

79+
// convert tokens to string
80+
// opposite llama_tokenize
81+
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
82+
inline std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens);
83+
7984
// load the tokens from encoder.json
8085
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
8186

0 commit comments

Comments
 (0)