Skip to content

Commit efe8a89

Browse files
committed
Fix test_llama_runner by hiding tiktoken
Summary: We don't always want to build the tiktoken dependencies (re2 and abseil), so this PR only builds them when the option is on. Test Plan: Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 9d5342d Pull Request resolved: #3055
1 parent 458d743 commit efe8a89

File tree

7 files changed

+51
-35
lines changed

7 files changed

+51
-35
lines changed

examples/models/llama2/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ project(llama_runner)
2121
# Duplicating options as root CMakeLists.txt
2222
option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)
2323

24-
option(EXECUTORCH_BUILD_RE2 "Build RE2" OFF)
24+
option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF)
2525

2626
include(CMakeDependentOption)
2727
#
@@ -88,7 +88,7 @@ endif()
8888

8989
# llama_runner library
9090
add_subdirectory(runner)
91-
if(EXECUTORCH_BUILD_RE2)
91+
if(EXECUTORCH_USE_TIKTOKEN)
9292
# find RE2 for tokenizer
9393
set(ABSL_ENABLE_INSTALL ON)
9494
set(_pic_flag

examples/models/llama2/main.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,6 @@ DEFINE_int32(
3939
-1,
4040
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
4141

42-
DEFINE_bool(
43-
use_tiktoken,
44-
false,
45-
"Use Tiktoken tokenizer instead of the default BPE tokenizer.");
46-
4742
int32_t main(int32_t argc, char** argv) {
4843
gflags::ParseCommandLineFlags(&argc, &argv, true);
4944

@@ -62,8 +57,6 @@ int32_t main(int32_t argc, char** argv) {
6257

6358
int32_t cpu_threads = FLAGS_cpu_threads;
6459

65-
bool use_tiktoken = FLAGS_use_tiktoken;
66-
6760
#if defined(ET_USE_THREADPOOL)
6861
uint32_t num_performant_cores = cpu_threads == -1
6962
? torch::executorch::cpuinfo::get_num_performant_cores()
@@ -76,8 +69,7 @@ int32_t main(int32_t argc, char** argv) {
7669
}
7770
#endif
7871
// create llama runner
79-
::torch::executor::Runner runner(
80-
model_path, tokenizer_path, temperature, use_tiktoken);
72+
::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
8173

8274
// generate
8375
runner.generate(prompt, seq_len);

examples/models/llama2/runner/CMakeLists.txt

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,19 +39,26 @@ list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
3939
target_include_directories(extension_module
4040
INTERFACE ${_common_include_directories})
4141

42-
if(CMAKE_TOOLCHAIN_IOS OR ANDROID OR APPLE)
43-
# Building a share library on iOS requires code signing
44-
# On Android we see duplicated registration when using shared lib
42+
if(EXECUTORCH_USE_TIKTOKEN)
43+
list(APPEND _llama_runner__srcs
44+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp)
45+
set(_preprocessor_flag -DET_USE_TIKTOKEN)
46+
endif()
47+
48+
if(CMAKE_TOOLCHAIN_IOS
49+
OR ANDROID
50+
OR APPLE)
51+
# Building a share library on iOS requires code signing On Android we see
52+
# duplicated registration when using shared lib
4553
add_library(llama_runner STATIC ${_llama_runner__srcs})
4654
else()
4755
add_library(llama_runner SHARED ${_llama_runner__srcs})
4856
endif()
4957

5058
set(llama_runner_deps executorch extension_module extension_data_loader)
5159

52-
target_link_libraries(
53-
llama_runner PUBLIC ${llama_runner_deps})
60+
target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
5461

55-
target_include_directories(llama_runner
56-
INTERFACE ${_common_include_directories}
57-
${EXECUTORCH_ROOT})
62+
target_include_directories(llama_runner INTERFACE ${_common_include_directories}
63+
${EXECUTORCH_ROOT})
64+
target_compile_options(llama_runner PUBLIC ${_preprocessor_flag})

examples/models/llama2/runner/runner.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
#include <executorch/examples/models/llama2/runner/runner.h>
1313
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
14+
#if defined(ET_USE_TIKTOKEN)
1415
#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
16+
#endif
1517
#include <executorch/extension/evalue_util/print_evalue.h>
1618
#include <executorch/extension/runner_util/managed_tensor.h>
1719

@@ -38,10 +40,8 @@ std::string statsToJsonString(const Runner::Stats& stats);
3840
Runner::Runner(
3941
const std::string& model_path,
4042
const std::string& tokenizer_path,
41-
const float temperature,
42-
bool use_tiktoken)
43-
: use_tiktoken_(use_tiktoken),
44-
module_(std::make_unique<Module>(
43+
const float temperature)
44+
: module_(std::make_unique<Module>(
4545
model_path,
4646
Module::MlockConfig::UseMlockIgnoreErrors)),
4747
tokenizer_path_(tokenizer_path),
@@ -80,11 +80,11 @@ Error Runner::load() {
8080
append_eos_ = getMetadataHelper("append_eos_to_prompt", false);
8181

8282
// Load tokenizer
83-
if (use_tiktoken_) {
84-
tokenizer_ = std::make_unique<Tiktoken>(vocab_size_, bos_id_, eos_id_);
85-
} else {
86-
tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
87-
}
83+
#if defined(ET_USE_TIKTOKEN)
84+
tokenizer_ = std::make_unique<Tiktoken>(vocab_size_, bos_id_, eos_id_);
85+
#else
86+
tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
87+
#endif
8888
tokenizer_->load(tokenizer_path_);
8989
if (tokenizer_->bos_tok() != bos_id_) {
9090
ET_LOG(

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ class Runner {
2929
explicit Runner(
3030
const std::string& model_path,
3131
const std::string& tokenizer_path,
32-
const float temperature = 0.8f,
33-
bool use_tiktoken = false);
32+
const float temperature = 0.8f);
3433

3534
struct Stats {
3635
// Scaling factor for timestamps - in this case, we use ms.
@@ -86,7 +85,6 @@ class Runner {
8685
int32_t n_bos_;
8786
int32_t n_eos_;
8887
int32_t max_seq_len_;
89-
bool use_tiktoken_;
9088
bool use_kv_cache_;
9189
bool use_sdpa_with_kv_cache_;
9290
bool append_eos_;

examples/models/llama2/runner/targets.bzl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,17 @@ def define_common_targets():
3030
exported_deps = [
3131
"//executorch/backends/xnnpack:xnnpack_backend",
3232
"//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,
33-
"//executorch/examples/models/llama2/tokenizer:tokenizer",
3433
"//executorch/extension/evalue_util:print_evalue" + aten_suffix,
3534
"//executorch/extension/runner_util:managed_tensor" + aten_suffix,
3635
"//executorch/extension/module:module" + aten_suffix,
3736
"//executorch/kernels/quantized:generated_lib" + aten_suffix,
3837
"//executorch/runtime/core/exec_aten:lib" + aten_suffix,
3938
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
40-
] + (_get_operator_lib(aten)) + ([
39+
] + ([
40+
"//executorch/examples/models/llama2/tokenizer:tiktoken",
41+
] if native.read_config("llama", "use_tiktoken", "0") == "1" else [
42+
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
43+
]) + (_get_operator_lib(aten)) + ([
4144
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
4245
# Therefore enable it explicitly for now to avoid failing tests
4346
"//executorch/backends/vulkan:vulkan_backend_lib",

examples/models/llama2/tokenizer/targets.bzl

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,30 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
22

33
def define_common_targets():
44
runtime.cxx_library(
5-
name = "tokenizer",
5+
name = "bpe_tokenizer",
66
srcs = [
77
"bpe_tokenizer.cpp",
8-
"tiktoken.cpp",
98
],
109
exported_headers = [
1110
"tokenizer.h",
1211
"bpe_tokenizer.h",
12+
],
13+
exported_deps = [
14+
"//executorch/runtime/core/exec_aten:lib",
15+
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
16+
],
17+
visibility = [
18+
"@EXECUTORCH_CLIENTS",
19+
],
20+
)
21+
22+
runtime.cxx_library(
23+
name = "tiktoken",
24+
srcs = [
25+
"tiktoken.cpp",
26+
],
27+
exported_headers = [
28+
"tokenizer.h",
1329
"tiktoken.h",
1430
"base64.h",
1531
],

0 commit comments

Comments
 (0)