Commit de0deb3

larryliu0820 authored and malfet committed
[retake] Add sentencepiece tokenizer (#626)
Squashed commits:

* Add sentencepiece tokenizer
* Add white space
* Handle white space
* Handle control ids
* More cleanup
* Lint
* Use unique_ptr
* Use a larger runner
* Debug
* Debug
* Cleanup
1 parent 77a581f commit de0deb3

File tree

9 files changed (+168, -364 lines)


.github/workflows/pull.yml

Lines changed: 13 additions & 18 deletions
@@ -468,7 +468,6 @@ jobs:
           pushd checkpoints/stories15M
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
           wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
           popd

           mkdir gguf_files
@@ -910,32 +909,29 @@ jobs:
       - name: Run inference
         run: |
           python torchchat.py download stories15M
-          wget -O ./tokenizer.bin https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
+          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

           export PRMT="Once upon a time in a land far away"

           python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}"

           python torchchat.py export stories15M --output-pte-path ./model.pte
-          ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
-
-          for dtype in fp32 fp16; do # bf16 needs to be supported
-            echo "Testing export + runner with dtype=$dtype"
-            python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
-            ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
-          done
-
+          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
+
+          for dtype in fp32 fp16; do # bf16 needs to be supported
+            echo "Testing export + runner with dtype=$dtype"
+            python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
+            ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
+          done
+
           echo "Tests complete."
   runner-aoti:
-    name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
-    needs: gather-models-cpu
     strategy:
-      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+      matrix:
+        runner: [16-core-ubuntu, macos-14-xlarge]
+    runs-on: ${{matrix.runner}}
     env:
       TORCHCHAT_ROOT: ${{ github.workspace }}
-      REPO_NAME: ${{ matrix.repo_name }}
     steps:
       - name: Checkout repo
         uses: actions/checkout@v3
@@ -962,7 +958,6 @@ jobs:
           pushd checkpoints/stories15M
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
           wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
           popd
       - name: Run inference
         run: |
@@ -976,7 +971,7 @@ jobs:
           for dtype in fp32 fp16 bf16 fast fast16; do
             echo "Running export + runner with dtype=$dtype"
             python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --dtype $dtype --output-dso-path /tmp/model.so
-            ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+            ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
           done

           echo "Tests complete."

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@
 [submodule "tokenizer/third-party/re2"]
 	path = tokenizer/third-party/re2
 	url = https://github.com/google/re2.git
+[submodule "tokenizer/third-party/sentencepiece"]
+	path = tokenizer/third-party/sentencepiece
+	url = https://github.com/google/sentencepiece.git

runner/run.cpp

Lines changed: 7 additions & 28 deletions
@@ -192,7 +192,6 @@ float* forward(Transformer* transformer, int token, int pos) {
                     .to(torch::dtype(torch::kFloat32))
                     .to(cpu_device);
   auto logits = result[0].data_ptr();
-
 #else // __ET_MODEL__
   ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
   ManagedTensor tokens_managed(
@@ -376,18 +375,15 @@ int sample(Sampler* sampler, float* logits) {
   return next;
 }

-Tokenizer* build_tokenizer(
-    const char* tokenizer_path,
-    ModelType model_type,
-    int vocab_size) {
+Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) {
   Tokenizer* tokenizer = NULL;
   switch (model_type) {
     case LLAMA2_MODEL:
-      tokenizer = new BPETokenizer(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new SPTokenizer();
       tokenizer->load(tokenizer_path);
       break;
     case LLAMA3_MODEL:
-      tokenizer = new Tiktoken(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new Tiktoken();
       tokenizer->load(tokenizer_path);
       break;
     default:
@@ -553,10 +549,7 @@ void generate(
       stop_tokens.push_back(tokenizer->eos_tok());
       break;
     case LLAMA3_MODEL:
-      prompt_tokens = tokenizer->encode(prompt, 0, 0);
-      prompt_tokens.insert(
-          prompt_tokens.begin(),
-          tokenizer->encode("<|begin_of_text|>", 0, 0)[0]);
+      prompt_tokens = tokenizer->encode(prompt, 1, 0);
       stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]);
       stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]);
       break;
@@ -911,30 +904,16 @@ int main(int argc, char* argv[]) {
   if (steps < 0)
     steps = 0;

+  Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type);
+
   // If no tokenizer path provided, get default for model_type
   if (vocab_size == -1) {
-    switch (model_type) {
-      case LLAMA2_MODEL:
-        vocab_size = 32000;
-        break;
-      case LLAMA3_MODEL:
-        vocab_size = 128256;
-        break;
-      default:
-        fprintf(
-            stderr,
-            "No vocab_size was provided with -v argument, and there is no default vocab_size for model_type %d.\n",
-            model_type);
-        error_usage();
-    }
+    vocab_size = tokenizer->vocab_size();
   }

   Transformer transformer;
   build_transformer(&transformer, model_path, vocab_size, steps);

-  Tokenizer* tokenizer =
-      build_tokenizer(tokenizer_path, model_type, vocab_size);
-
   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
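Note (not part of the diff): build_tokenizer now constructs an SPTokenizer for Llama 2 models, and main() asks the loaded tokenizer for its vocab_size instead of hard-coding 32000/128256. For orientation, here is a minimal sketch of what such a wrapper over the public sentencepiece C++ API could look like. The Tokenizer interface below is an assumption reconstructed from the calls visible in this diff (load, encode with bos/eos counts, vocab_size, bos_tok/eos_tok); the actual sentencepiece.cpp added by this commit may differ.

// Hypothetical sketch, not the file added by this commit: an SPTokenizer
// wrapping sentencepiece behind the interface the runner appears to use.
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <sentencepiece_processor.h>

// Assumed shape of the runner's abstract tokenizer.
class Tokenizer {
 public:
  virtual ~Tokenizer() = default;
  virtual void load(const std::string& path) = 0;
  // bos/eos: number of BOS/EOS tokens to add around the encoded text.
  virtual std::vector<uint64_t> encode(const std::string& text, int8_t bos, int8_t eos) = 0;
  virtual uint64_t vocab_size() const = 0;
  virtual uint64_t bos_tok() const = 0;
  virtual uint64_t eos_tok() const = 0;
};

class SPTokenizer : public Tokenizer {
 public:
  SPTokenizer() : sp_(std::make_unique<sentencepiece::SentencePieceProcessor>()) {}

  void load(const std::string& path) override {
    // SentencePieceProcessor::Load reads the .model protobuf (e.g. tokenizer.model).
    if (!sp_->Load(path).ok()) {
      throw std::runtime_error("failed to load sentencepiece model: " + path);
    }
  }

  std::vector<uint64_t> encode(const std::string& text, int8_t bos, int8_t eos) override {
    std::vector<int> ids;
    if (!sp_->Encode(text, &ids).ok()) {
      throw std::runtime_error("sentencepiece encode failed");
    }
    std::vector<uint64_t> out;
    for (int8_t i = 0; i < bos; ++i) out.push_back(bos_tok());
    out.insert(out.end(), ids.begin(), ids.end());
    for (int8_t i = 0; i < eos; ++i) out.push_back(eos_tok());
    return out;
  }

  // Vocab size comes from the model file, so main() no longer hard-codes it.
  uint64_t vocab_size() const override { return sp_->GetPieceSize(); }
  uint64_t bos_tok() const override { return sp_->bos_id(); }
  uint64_t eos_tok() const override { return sp_->eos_id(); }

 private:
  std::unique_ptr<sentencepiece::SentencePieceProcessor> sp_;
};

With this bos/eos convention on the shared interface, encode(prompt, 1, 0) prepends exactly one BOS token, which is why the Llama 3 branch in generate() no longer splices <|begin_of_text|> in by hand.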
tokenizer/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
@@ -9,10 +9,11 @@ ENDIF()
 # build tokenizer library
 add_library(
   tokenizer
-  bpe_tokenizer.cpp
+  tokenizer.h
+  sentencepiece.cpp
   tiktoken.cpp)

-target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src)

 # add RE2 as subdirectory
 set(ABSL_ENABLE_INSTALL ON)
@@ -22,6 +23,7 @@ ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(third-party/abseil-cpp)
 add_subdirectory(third-party/re2)
+add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

-target_link_libraries(tokenizer PUBLIC re2::re2)
+target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static)
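The tokenizer library now links sentencepiece-static and exposes the sentencepiece headers through third-party/sentencepiece/src. As a hypothetical sanity check (not part of this commit), a small program built against this target could load the same tokenizer.model the CI workflows fetch and round-trip a prompt, which is why the jobs above can drop tokenizer.bin entirely:

// Hypothetical smoke test: exercises the sentencepiece library linked above.
#include <iostream>
#include <string>
#include <vector>

#include <sentencepiece_processor.h>  // found via third-party/sentencepiece/src

int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: sp_smoke <tokenizer.model>\n";
    return 1;
  }
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load(argv[1]).ok()) {
    std::cerr << "failed to load " << argv[1] << "\n";
    return 1;
  }
  // For the llama2.c tokenizer.model this should print 32000,
  // the value the old runner code hard-coded for LLAMA2_MODEL.
  std::cout << "vocab size: " << sp.GetPieceSize() << "\n";

  const std::vector<int> ids = sp.EncodeAsIds("Once upon a time in a land far away");
  std::cout << "encoded " << ids.size() << " tokens, round trip: " << sp.DecodeIds(ids) << "\n";
  return 0;
}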
