
Commit 17d39e4

helunwencser authored and facebook-github-bot committed
Remove logic for appending or prepending tokens (#4920)
Summary:
Pull Request resolved: #4920

The runner should not prepend or append any tokens to the prompt. Users should make sure that the prompt is properly formatted according to the model's chat template. This PR removes the logic for appending or prepending tokens.

Test Plan: CI

imported-using-ghimport

Differential Revision: D61824091

Pulled By: helunwencser
1 parent 0a11e99 · commit 17d39e4
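Since the runner no longer inserts BOS or EOS tokens, the caller must spell out the full chat-template markup in the prompt string. Below is a minimal, non-authoritative Python sketch of that formatting for the Llama 3 Instruct template; the helper name `format_llama3_prompt` is illustrative and not part of this PR, and the sketch assumes the tokenizer maps these special-token strings to their token IDs.

```python
# Illustrative sketch (not part of this PR): with the runner no longer
# adding BOS/EOS, the caller supplies the full chat-template markup.
# Shown for the Llama 3 Instruct template; other models use different templates.
def format_llama3_prompt(user_message: str, system_message: str = "") -> str:
    # <|begin_of_text|> is the BOS marker (ID 128000 in Llama 3).
    prompt = "<|begin_of_text|>"
    if system_message:
        prompt += (
            "<|start_header_id|>system<|end_header_id|>\n\n"
            f"{system_message}<|eot_id|>"
        )
    prompt += (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        # Leave the assistant header open so the model generates the reply.
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return prompt
```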

4 files changed: +4 −15 lines

examples/models/llama2/README.md
2 additions, 2 deletions

@@ -146,7 +146,7 @@ python -m examples.models.llama2.export_llama \
     --use_sdpa_with_kv_cache \
     -X \
     -d bf16 \
-    --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' \
+    --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
     --output_name="llama3_2.pte"
 ```

@@ -172,7 +172,7 @@ python -m examples.models.llama2.export_llama \
     -d fp32 \
     --preq_embedding_quantize 8,0 \
     --use_spin_quant native \
-    --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}'
+    --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
 ```

 ### Option B: Download and export Llama 3 8B instruct model
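With the append/prepend counts gone, the `--metadata` JSON shrinks to the two keys the runtime still needs: which ID marks beginning-of-sequence and which IDs may terminate generation. A small sketch of reading that JSON, using the values from the commands above (the ID-to-token comments are the standard Llama 3 assignments):

```python
import json

# Sketch: the trimmed metadata only declares token IDs; the per-prompt
# counts (get_n_bos/get_n_eos) are gone because the runner no longer
# inserts tokens itself.
metadata = json.loads('{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}')
assert metadata["get_bos_id"] == 128000             # <|begin_of_text|>
assert metadata["get_eos_ids"] == [128009, 128001]  # <|eot_id|>, <|end_of_text|>
```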

examples/models/llama2/export_llama_lib.py
0 additions, 3 deletions

@@ -723,12 +723,9 @@ def _load_llama_model_metadata(
 ):
     is_fairseq2 = weight_type == WeightType.FAIRSEQ2
     metadata = {
-        "append_eos_to_prompt": is_fairseq2,  # For language llama, tell the runtime to always append EOS token(s) to prompt.
         "get_bos_id": 3 if is_fairseq2 else 1,
         "get_eos_ids": [3] if is_fairseq2 else [2],
         "get_max_seq_len": model_args.max_seq_len,
-        "get_n_bos": 1,
-        "get_n_eos": 2 if is_fairseq2 else 1,
         "get_n_layers": model_args.n_layers,
         "get_vocab_size": model_args.vocab_size,
         "use_kv_cache": use_kv_cache,

examples/models/llama2/runner/runner.cpp
2 additions, 8 deletions

@@ -27,13 +27,10 @@ using ::executorch::runtime::Result;
 namespace llm = ::executorch::extension::llm;

 namespace {
-static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt";
 static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
 static constexpr auto kBosId = "get_bos_id";
 static constexpr auto kEosIds = "get_eos_ids";
 static constexpr auto kMaxSeqLen = "get_max_seq_len";
-static constexpr auto kNBos = "get_n_bos";
-static constexpr auto kNEos = "get_n_eos";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";

@@ -50,11 +47,8 @@ Runner::Runner(
       module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
       metadata_({
-          {kAppendEosToPrompt, false},
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
-          {kNBos, 1},
-          {kNEos, 1},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
       }) {

@@ -203,8 +197,8 @@ Error Runner::generate(

   Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,
-      metadata_.at(kNBos),
-      metadata_.at(kAppendEosToPrompt) ? metadata_.at(kNEos) : 0);
+      /* bos */ 0,
+      /* eos */ 0);

   ET_CHECK_OK_OR_RETURN_ERROR(
       encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
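The behavioral core of the change is this `tokenizer_->encode` call: the BOS count and the conditional EOS count are both replaced with hard-coded zeros. A Python sketch of the equivalent behavior, assuming a tokenizer with an `encode(text, bos, eos)` interface like Meta's reference Llama tokenizer (an assumption for illustration, not ExecuTorch's actual Python API):

```python
# Sketch (assumed tokenizer interface, not ExecuTorch code): encode the
# prompt without inserting BOS or EOS. Any special tokens the model
# expects must already be spelled out in `prompt` by the chat template.
def encode_prompt(tokenizer, prompt: str) -> list[int]:
    return tokenizer.encode(prompt, bos=False, eos=False)
```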

extension/export_util/export_hf_model.py
0 additions, 2 deletions

@@ -85,8 +85,6 @@ def _get_constant_methods(model: PreTrainedModel):
         "get_head_dim": model.config.hidden_size / model.config.num_attention_heads,
         "get_max_batch_size": model.generation_config.cache_config.batch_size,
         "get_max_seq_len": model.generation_config.cache_config.max_cache_len,
-        "get_n_bos": 1,
-        "get_n_eos": 1,
         "get_n_kv_heads": model.config.num_key_value_heads,
         "get_n_layers": model.config.num_hidden_layers,
         "get_vocab_size": model.config.vocab_size,
