
Commit ed8891f

larryliu0820 authored and malfet committed
Add sentencepiece tokenizer (#607)
* Add sentencepiece tokenizer
* Add white space
* Handle white space
* See if CI is happy
* Handle control ids
* More cleanup
* Lint
* Use unique_ptr
1 parent f5548ba commit ed8891f

9 files changed: +141 −321 lines changed

.github/workflows/pull.yml

Lines changed: 3 additions & 5 deletions
@@ -463,7 +463,6 @@ jobs:
         pushd checkpoints/stories15M
         wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
         wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
         popd

         mkdir gguf_files
@@ -900,14 +899,14 @@ jobs:
     - name: Run inference
       run: |
         python torchchat.py download stories15M
-        wget -O ./tokenizer.bin https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
+        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

         export PRMT="Once upon a time in a land far away"

         python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}"

         python torchchat.py export stories15M --output-pte-path ./model.pte
-        ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
+        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

         echo "Tests complete."
   runner-aoti:
@@ -946,7 +945,6 @@ jobs:
         pushd checkpoints/stories15M
         wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
         wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
         popd
     - name: Run inference
       run: |
@@ -960,7 +958,7 @@ jobs:
         for dtype in fp32 fp16 bf16 fast fast16; do
           echo "Running export + runner with dtype=$dtype"
           python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --dtype $dtype --output-dso-path /tmp/model.so
-          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
         done

         echo "Tests complete."

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@
 [submodule "tokenizer/third-party/re2"]
 	path = tokenizer/third-party/re2
 	url = https://github.com/google/re2.git
+[submodule "tokenizer/third-party/sentencepiece"]
+	path = tokenizer/third-party/sentencepiece
+	url = https://github.com/google/sentencepiece.git

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # PyTorch ecosystem
 torch
 torchao
-executorch
+executorch==0.1.2

 # Hugging Face download
 huggingface_hub

runner/run.cpp

Lines changed: 8 additions & 9 deletions
@@ -383,11 +383,11 @@ Tokenizer* build_tokenizer(
   Tokenizer* tokenizer = NULL;
   switch (model_type) {
     case LLAMA2_MODEL:
-      tokenizer = new BPETokenizer(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new SPTokenizer(vocab_size, /*bos*/ 1, /*eos*/ 2);
       tokenizer->load(tokenizer_path);
       break;
     case LLAMA3_MODEL:
-      tokenizer = new Tiktoken(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new Tiktoken(vocab_size, /*bos*/ 128000, /*eos*/ 128001);
       tokenizer->load(tokenizer_path);
       break;
     default:
@@ -503,9 +503,11 @@ unsigned generate_from_prompt_tokens(
       printf("\n");
     } else {
       std::string piece = tokenizer->decode(token, next);
-      safe_printf(piece.c_str()); // same as printf("%s", piece), but skips
-                                  // "unsafe" bytes
-      fflush(stdout);
+      if (!piece.empty() && piece.length() != 0) {
+        safe_printf(piece.c_str()); // same as printf("%s", piece), but skips
+                                    // "unsafe" bytes
+        fflush(stdout);
+      }
     }
   }

@@ -553,10 +555,7 @@ void generate(
       stop_tokens.push_back(tokenizer->eos_tok());
       break;
     case LLAMA3_MODEL:
-      prompt_tokens = tokenizer->encode(prompt, 0, 0);
-      prompt_tokens.insert(
-          prompt_tokens.begin(),
-          tokenizer->encode("<|begin_of_text|>", 0, 0)[0]);
+      prompt_tokens = tokenizer->encode(prompt, 1, 0);
       stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]);
       stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]);
       break;
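
For context, these hunks change which concrete tokenizer the runner builds and how the Llama 3 prompt is encoded: the Llama 2 path now constructs an SPTokenizer (SentencePiece) instead of the old BPETokenizer, the Tiktoken path uses the real Llama 3 special-token ids (128000/128001), and the manual prepend of the "<|begin_of_text|>" token is replaced by asking encode() to add one BOS token. The sketch below is only an illustration of that calling pattern, not code from the commit; it assumes "tokenizer.h" (now listed in tokenizer/CMakeLists.txt) declares the Tokenizer base class plus SPTokenizer and Tiktoken with the constructors and methods visible in the diff, and the helper names (make_tokenizer, is_llama3, encode_llama3_prompt) are hypothetical.

// Illustrative sketch only -- mirrors the calling pattern in runner/run.cpp
// after this commit; names outside the diff are assumptions.
#include <string>

#include "tokenizer.h"

Tokenizer* make_tokenizer(bool is_llama3,
                          int vocab_size,
                          const std::string& tokenizer_path) {
  Tokenizer* tokenizer = nullptr;
  if (is_llama3) {
    // Llama 3: tiktoken-style vocab; BOS/EOS are the real special-token ids.
    tokenizer = new Tiktoken(vocab_size, /*bos*/ 128000, /*eos*/ 128001);
  } else {
    // Llama 2: SentencePiece-backed tokenizer, replacing the old BPETokenizer.
    tokenizer = new SPTokenizer(vocab_size, /*bos*/ 1, /*eos*/ 2);
  }
  // Both paths now load the upstream tokenizer.model (tokenizer.bin is gone).
  tokenizer->load(tokenizer_path);
  return tokenizer;
}

void encode_llama3_prompt(Tokenizer* tokenizer, const std::string& prompt) {
  // encode(text, n_bos, n_eos): requesting one BOS token replaces the old
  // manual insert of the "<|begin_of_text|>" token at the front.
  auto prompt_tokens = tokenizer->encode(prompt, 1, 0);
  // Stop tokens are still looked up by encoding the literal special strings.
  auto end_of_text = tokenizer->encode("<|end_of_text|>", 0, 0)[0];
  auto eot_id = tokenizer->encode("<|eot_id|>", 0, 0)[0];
  (void)prompt_tokens;
  (void)end_of_text;
  (void)eot_id;
}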

tokenizer/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
@@ -9,10 +9,11 @@ ENDIF()
 # build tokenizer library
 add_library(
   tokenizer
-  bpe_tokenizer.cpp
+  tokenizer.h
+  sentencepiece.cpp
   tiktoken.cpp)

-target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src)

 # add RE2 as subdirectory
 set(ABSL_ENABLE_INSTALL ON)
@@ -22,6 +23,7 @@ ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(third-party/abseil-cpp)
 add_subdirectory(third-party/re2)
+add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

-target_link_libraries(tokenizer PUBLIC re2::re2)
+target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static)