Commit de0deb3

larryliu0820 authored and malfet committed
[retake] Add sentencepiece tokenizer (#626)
Squashed commits:

* Add sentencepiece tokenizer
* Add white space
* Handle white space
* Handle control ids
* More cleanup
* Lint
* Use unique_ptr
* Use a larger runner
* Debug
* Debug
* Cleanup
1 parent 77a581f commit de0deb3

File tree

9 files changed (+168, -364 lines)


.github/workflows/pull.yml

Lines changed: 13 additions & 18 deletions
@@ -468,7 +468,6 @@ jobs:
           pushd checkpoints/stories15M
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
           wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
           popd

           mkdir gguf_files
@@ -910,32 +909,29 @@ jobs:
       - name: Run inference
         run: |
           python torchchat.py download stories15M
-          wget -O ./tokenizer.bin https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
+          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

           export PRMT="Once upon a time in a land far away"

           python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}"

           python torchchat.py export stories15M --output-pte-path ./model.pte
-          ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
-
-          for dtype in fp32 fp16; do # bf16 needs to be supported
-            echo "Testing export + runner with dtype=$dtype"
-            python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
-            ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
-          done
-
+          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
+
+          for dtype in fp32 fp16; do # bf16 needs to be supported
+            echo "Testing export + runner with dtype=$dtype"
+            python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
+            ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
+          done
+
           echo "Tests complete."
   runner-aoti:
-    name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
-    needs: gather-models-cpu
     strategy:
-      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+      matrix:
+        runner: [16-core-ubuntu, macos-14-xlarge]
+    runs-on: ${{matrix.runner}}
     env:
       TORCHCHAT_ROOT: ${{ github.workspace }}
-      REPO_NAME: ${{ matrix.repo_name }}
     steps:
       - name: Checkout repo
         uses: actions/checkout@v3
@@ -962,7 +958,6 @@ jobs:
           pushd checkpoints/stories15M
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
           wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
           popd
       - name: Run inference
         run: |
@@ -976,7 +971,7 @@ jobs:
           for dtype in fp32 fp16 bf16 fast fast16; do
             echo "Running export + runner with dtype=$dtype"
             python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --dtype $dtype --output-dso-path /tmp/model.so
-            ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+            ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
           done

           echo "Tests complete."

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@
 [submodule "tokenizer/third-party/re2"]
 	path = tokenizer/third-party/re2
 	url = https://github.com/google/re2.git
+[submodule "tokenizer/third-party/sentencepiece"]
+	path = tokenizer/third-party/sentencepiece
+	url = https://github.com/google/sentencepiece.git

runner/run.cpp

Lines changed: 7 additions & 28 deletions
@@ -192,7 +192,6 @@ float* forward(Transformer* transformer, int token, int pos) {
                     .to(torch::dtype(torch::kFloat32))
                     .to(cpu_device);
   auto logits = result[0].data_ptr();
-
 #else // __ET_MODEL__
   ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
   ManagedTensor tokens_managed(
@@ -376,18 +375,15 @@ int sample(Sampler* sampler, float* logits) {
   return next;
 }

-Tokenizer* build_tokenizer(
-    const char* tokenizer_path,
-    ModelType model_type,
-    int vocab_size) {
+Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) {
   Tokenizer* tokenizer = NULL;
   switch (model_type) {
     case LLAMA2_MODEL:
-      tokenizer = new BPETokenizer(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new SPTokenizer();
       tokenizer->load(tokenizer_path);
       break;
     case LLAMA3_MODEL:
-      tokenizer = new Tiktoken(vocab_size, /*bos*/ 1, /*eos*/ 2);
+      tokenizer = new Tiktoken();
       tokenizer->load(tokenizer_path);
       break;
     default:
@@ -553,10 +549,7 @@ void generate(
       stop_tokens.push_back(tokenizer->eos_tok());
       break;
     case LLAMA3_MODEL:
-      prompt_tokens = tokenizer->encode(prompt, 0, 0);
-      prompt_tokens.insert(
-          prompt_tokens.begin(),
-          tokenizer->encode("<|begin_of_text|>", 0, 0)[0]);
+      prompt_tokens = tokenizer->encode(prompt, 1, 0);
       stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]);
       stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]);
       break;
@@ -911,30 +904,16 @@ int main(int argc, char* argv[]) {
   if (steps < 0)
     steps = 0;

+  Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type);
+
   // If no tokenizer path provided, get default for model_type
   if (vocab_size == -1) {
-    switch (model_type) {
-      case LLAMA2_MODEL:
-        vocab_size = 32000;
-        break;
-      case LLAMA3_MODEL:
-        vocab_size = 128256;
-        break;
-      default:
-        fprintf(
-            stderr,
-            "No vocab_size was provided with -v argument, and there is no default vocab_size for model_type %d.\n",
-            model_type);
-        error_usage();
-    }
+    vocab_size = tokenizer->vocab_size();
   }

   Transformer transformer;
   build_transformer(&transformer, model_path, vocab_size, steps);

-  Tokenizer* tokenizer =
-      build_tokenizer(tokenizer_path, model_type, vocab_size);
-
   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
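Note (not part of the diff): build_tokenizer now constructs an SPTokenizer for Llama 2 models, and main() asks the loaded tokenizer for its vocab_size instead of hard-coding 32000/128256. For orientation, here is a minimal sketch of what such a wrapper over the public sentencepiece C++ API could look like. The Tokenizer interface below is an assumption reconstructed from the calls visible in this diff (load, encode with bos/eos counts, vocab_size, bos_tok/eos_tok); the actual sentencepiece.cpp added by this commit may differ.

// Hypothetical sketch, not the file added by this commit: an SPTokenizer
// wrapping sentencepiece behind the interface the runner appears to use.
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <sentencepiece_processor.h>

// Assumed shape of the runner's abstract tokenizer.
class Tokenizer {
 public:
  virtual ~Tokenizer() = default;
  virtual void load(const std::string& path) = 0;
  // bos/eos: number of BOS/EOS tokens to add around the encoded text.
  virtual std::vector<uint64_t> encode(const std::string& text, int8_t bos, int8_t eos) = 0;
  virtual uint64_t vocab_size() const = 0;
  virtual uint64_t bos_tok() const = 0;
  virtual uint64_t eos_tok() const = 0;
};

class SPTokenizer : public Tokenizer {
 public:
  SPTokenizer() : sp_(std::make_unique<sentencepiece::SentencePieceProcessor>()) {}

  void load(const std::string& path) override {
    // SentencePieceProcessor::Load reads the .model protobuf (e.g. tokenizer.model).
    if (!sp_->Load(path).ok()) {
      throw std::runtime_error("failed to load sentencepiece model: " + path);
    }
  }

  std::vector<uint64_t> encode(const std::string& text, int8_t bos, int8_t eos) override {
    std::vector<int> ids;
    if (!sp_->Encode(text, &ids).ok()) {
      throw std::runtime_error("sentencepiece encode failed");
    }
    std::vector<uint64_t> out;
    for (int8_t i = 0; i < bos; ++i) out.push_back(bos_tok());
    out.insert(out.end(), ids.begin(), ids.end());
    for (int8_t i = 0; i < eos; ++i) out.push_back(eos_tok());
    return out;
  }

  // Vocab size comes from the model file, so main() no longer hard-codes it.
  uint64_t vocab_size() const override { return sp_->GetPieceSize(); }
  uint64_t bos_tok() const override { return sp_->bos_id(); }
  uint64_t eos_tok() const override { return sp_->eos_id(); }

 private:
  std::unique_ptr<sentencepiece::SentencePieceProcessor> sp_;
};

With this bos/eos convention on the shared interface, encode(prompt, 1, 0) prepends exactly one BOS token, which is why the Llama 3 branch in generate() no longer splices <|begin_of_text|> in by hand.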
tokenizer/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
@@ -9,10 +9,11 @@ ENDIF()
 # build tokenizer library
 add_library(
   tokenizer
-  bpe_tokenizer.cpp
+  tokenizer.h
+  sentencepiece.cpp
   tiktoken.cpp)

-target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src)

 # add RE2 as subdirectory
 set(ABSL_ENABLE_INSTALL ON)
@@ -22,6 +23,7 @@ ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(third-party/abseil-cpp)
 add_subdirectory(third-party/re2)
+add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

-target_link_libraries(tokenizer PUBLIC re2::re2)
+target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static)
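The tokenizer library now links sentencepiece-static and exposes the sentencepiece headers through third-party/sentencepiece/src. As a hypothetical sanity check (not part of this commit), a small program built against this target could load the same tokenizer.model the CI workflows fetch and round-trip a prompt, which is why the jobs above can drop tokenizer.bin entirely:

// Hypothetical smoke test: exercises the sentencepiece library linked above.
#include <iostream>
#include <string>
#include <vector>

#include <sentencepiece_processor.h>  // found via third-party/sentencepiece/src

int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: sp_smoke <tokenizer.model>\n";
    return 1;
  }
  sentencepiece::SentencePieceProcessor sp;
  if (!sp.Load(argv[1]).ok()) {
    std::cerr << "failed to load " << argv[1] << "\n";
    return 1;
  }
  // For the llama2.c tokenizer.model this should print 32000,
  // the value the old runner code hard-coded for LLAMA2_MODEL.
  std::cout << "vocab size: " << sp.GetPieceSize() << "\n";

  const std::vector<int> ids = sp.EncodeAsIds("Once upon a time in a land far away");
  std::cout << "encoded " << ids.size() << " tokens, round trip: " << sp.DecodeIds(ids) << "\n";
  return 0;
}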
