Skip to content

Commit 7ab7b73

Browse files
zhangfuwenzhangfuwenggerganov
authored
android : fix utf8 decoding error (#5935)
* examples: fix utf8 decoding error. Some models have a tokenizer that decodes an id into an incomplete UTF-8 sequence, so we need to validate the accumulated bytes and wait for the next token. One example would be: https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q4_0.gguf and an example of such a token is 18137. * android : minor --------- Co-authored-by: zhangfuwen <[email protected]> Co-authored-by: Georgi Gerganov <[email protected]>
1 parent d9f65c9 commit 7ab7b73

File tree

2 files changed

+53
-4
lines changed

2 files changed

+53
-4
lines changed

examples/llama.android/app/src/main/cpp/llama-android.cpp

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,45 @@ jclass la_int_var;
3333
jmethodID la_int_var_value;
3434
jmethodID la_int_var_inc;
3535

36+
std::string cached_token_chars;
37+
38+
/**
 * Checks whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * The string is scanned one encoded code point at a time. A sequence is
 * accepted only if it matches the well-formed byte ranges of RFC 3629:
 *   - the lead byte announces a length of 1 to 4 bytes,
 *   - every following byte is a continuation byte,
 *   - the sequence is not an overlong encoding,
 *   - it does not encode a UTF-16 surrogate (U+D800..U+DFFF),
 *   - it does not encode a value above U+10FFFF.
 *
 * A multi-byte sequence that is cut short by the terminating NUL is reported
 * as invalid, which lets a caller keep buffering bytes until the sequence is
 * complete.
 *
 * @param string NUL-terminated bytes to validate; may be null.
 * @return true if `string` is null, empty, or entirely well-formed UTF-8;
 *         false otherwise.
 */
bool is_valid_utf8(const char * string) {
    if (!string) {
        return true;
    }

    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(string);

    while (*bytes != 0x00) {
        const unsigned char lead = *bytes;
        int num;

        // Allowed range for the FIRST continuation byte. It is narrower than
        // 0x80..0xBF for a few lead bytes, which is how overlong encodings,
        // surrogates and values above U+10FFFF are excluded.
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead < 0x80) {
            // U+0000 to U+007F
            num = 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // U+0080 to U+07FF (0xC0 and 0xC1 would only produce overlong forms)
            num = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            num = 3;
            if (lead == 0xE0) {
                lo = 0xA0; // below this the encoding is overlong
            } else if (lead == 0xED) {
                hi = 0x9F; // above this lies the surrogate range
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // U+10000 to U+10FFFF
            num = 4;
            if (lead == 0xF0) {
                lo = 0x90; // below this the encoding is overlong
            } else if (lead == 0xF4) {
                hi = 0x8F; // above this the value exceeds U+10FFFF
            }
        } else {
            // stray continuation byte or a lead byte that is never valid
            return false;
        }

        bytes += 1;
        for (int i = 1; i < num; ++i) {
            // The NUL terminator is outside every allowed range, so a
            // truncated sequence fails here without reading past the end.
            if (*bytes < lo || *bytes > hi) {
                return false;
            }
            // Only the first continuation byte has a restricted range.
            lo = 0x80;
            hi = 0xBF;
            bytes += 1;
        }
    }

    return true;
}
74+
3675
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
3776
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
3877
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
@@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init(
295334
jint n_len
296335
) {
297336

337+
cached_token_chars.clear();
338+
298339
const auto text = env->GetStringUTFChars(jtext, 0);
299340
const auto context = reinterpret_cast<llama_context *>(context_pointer);
300341
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
@@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop(
372413
}
373414

374415
auto new_token_chars = llama_token_to_piece(context, new_token_id);
375-
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
376-
auto new_token = env->NewStringUTF(new_token_chars.c_str());
416+
cached_token_chars += new_token_chars;
417+
418+
jstring new_token = nullptr;
419+
if (is_valid_utf8(cached_token_chars.c_str())) {
420+
new_token = env->NewStringUTF(cached_token_chars.c_str());
421+
LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
422+
cached_token_chars.clear();
423+
} else {
424+
new_token = env->NewStringUTF("");
425+
}
377426

378427
llama_batch_clear(*batch);
379428
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);

examples/llama.android/app/src/main/java/com/example/llama/Llm.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class Llm {
7171
batch: Long,
7272
nLen: Int,
7373
ncur: IntVar
74-
): String
74+
): String?
7575

7676
private external fun kv_cache_clear(context: Long)
7777

@@ -115,7 +115,7 @@ class Llm {
115115
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
116116
while (ncur.value <= nlen) {
117117
val str = completion_loop(state.context, state.batch, nlen, ncur)
118-
if (str.isEmpty()) {
118+
if (str == null) {
119119
break
120120
}
121121
emit(str)

0 commit comments

Comments
 (0)