
Commit 8e5fd6f

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.gitignore
#	README.md
#	docs/backend/BLIS.md
#	docs/backend/SYCL.md
#	docs/development/llama-star/idea-arch.key
#	docs/development/llama-star/idea-arch.pdf
#	docs/development/token_generation_performance_tips.md
#	src/llama.cpp
#	tests/test-tokenizer-0.cpp
#	tests/test-tokenizer-1-bpe.cpp
#	tests/test-tokenizer-1-spm.cpp
#	tests/test-tokenizer-random.py
2 parents: 5e458f4 + 87e25a1

28 files changed (+352 / -2091 lines)

AUTHORS

Lines changed: 0 additions & 782 deletions
This file was deleted.

common/common.cpp

Lines changed: 21 additions & 37 deletions
@@ -2593,51 +2593,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
     std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return result;
+    return piece;
 }
 
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
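
Both helpers rely on the same size-negotiation convention in the core C API: a negative return value is the negated buffer size that is actually required, so the caller resizes and retries once. Below is a minimal sketch of a direct caller, assuming a loaded llama_model * model; the new fifth argument to llama_token_to_piece (passed as 0 here and in the Swift call sites further down) is assumed to control stripping of leading spaces.

#include <string>
#include <vector>

#include "llama.h"

// Hypothetical direct use of the core C API (not part of this commit):
// a negative return value is the required size, negated; resize and retry.
static std::string piece_from_token(const llama_model * model, llama_token token) {
    std::vector<char> buf(8, 0);
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), 0, /*special=*/true);
    if (n < 0) {
        buf.resize(-n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), 0, /*special=*/true);
    }
    return std::string(buf.data(), n);
}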

common/common.h

Lines changed: 4 additions & 12 deletions
@@ -367,21 +367,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
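
With this change, caller code no longer has to pick between the SPM and BPE variants. A minimal usage sketch, assuming a llama_context * ctx and the llama_tokenize(ctx, text, add_special) helper that common.h already declares:

#include <string>
#include <vector>

#include "common.h"

// Round-trip a prompt through the new tokenizer-agnostic helper.
// special = false keeps special/control tokens out of the rendered text.
static std::string roundtrip(llama_context * ctx, const std::string & prompt) {
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_special=*/true);
    return llama_detokenize(ctx, tokens, /*special=*/false);
}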

docs/HOWTO-add-model.md

Lines changed: 0 additions & 119 deletions
This file was deleted.

docs/debugging-tests.md

Lines changed: 0 additions & 104 deletions
This file was deleted.

examples/batched.swift/Sources/main.swift

Lines changed: 2 additions & 1 deletion
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
             token,
             &result,
             Int32(result.count),
+            0,
             false
         )
         assert(check == actualTokensCount)

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 2 additions & 2 deletions
@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
