Remove logic for appending or prepending tokens #4920

Closed · wants to merge 1 commit

examples/models/llama2/README.md — 4 changes: 2 additions & 2 deletions

````diff
@@ -146,7 +146,7 @@ python -m examples.models.llama2.export_llama \
   --use_sdpa_with_kv_cache \
   -X \
   -d bf16 \
-  --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' \
+  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
   --output_name="llama3_2.pte"
 ```
 
@@ -172,7 +172,7 @@ python -m examples.models.llama2.export_llama \
   -d fp32 \
   --preq_embedding_quantize 8,0 \
   --use_spin_quant native \
-  --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}'
+  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
 ```
 
 ### Option B: Download and export Llama 3 8B instruct model
````
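
The README change only trims the `--metadata` JSON; the remaining keys are the ones the runner still reads. As a quick sanity check (plain standard-library Python, not part of this PR), the trimmed string still parses and carries both keys:

```python
import json

# The trimmed --metadata value from the updated export commands above.
metadata = json.loads('{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}')

# Only the BOS id and EOS ids remain; the append/count knobs are gone.
assert metadata["get_bos_id"] == 128000
assert metadata["get_eos_ids"] == [128009, 128001]
```
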
examples/models/llama2/export_llama_lib.py — 3 changes: 0 additions & 3 deletions

```diff
@@ -723,12 +723,9 @@ def _load_llama_model_metadata(
 ):
     is_fairseq2 = weight_type == WeightType.FAIRSEQ2
     metadata = {
-        "append_eos_to_prompt": is_fairseq2,  # For language llama, tell the runtime to always append EOS token(s) to prompt.
         "get_bos_id": 3 if is_fairseq2 else 1,
         "get_eos_ids": [3] if is_fairseq2 else [2],
         "get_max_seq_len": model_args.max_seq_len,
-        "get_n_bos": 1,
-        "get_n_eos": 2 if is_fairseq2 else 1,
         "get_n_layers": model_args.n_layers,
         "get_vocab_size": model_args.vocab_size,
         "use_kv_cache": use_kv_cache,
```
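
To make the net effect of the deletion concrete, here is a self-contained sketch of the metadata dict the exporter emits after this change. `ModelArgs` below is a hypothetical stand-in for the real class in export_llama_lib.py, with made-up default values:

```python
from dataclasses import dataclass


# Hypothetical stand-in for the exporter's real ModelArgs; values are examples.
@dataclass
class ModelArgs:
    max_seq_len: int = 128
    n_layers: int = 32
    vocab_size: int = 32000


def load_metadata(is_fairseq2: bool, model_args: ModelArgs, use_kv_cache: bool) -> dict:
    # Mirrors the post-PR dict: append_eos_to_prompt, get_n_bos and
    # get_n_eos are no longer emitted.
    return {
        "get_bos_id": 3 if is_fairseq2 else 1,
        "get_eos_ids": [3] if is_fairseq2 else [2],
        "get_max_seq_len": model_args.max_seq_len,
        "get_n_layers": model_args.n_layers,
        "get_vocab_size": model_args.vocab_size,
        "use_kv_cache": use_kv_cache,
    }


print(load_metadata(False, ModelArgs(), True))
```
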
examples/models/llama2/runner/runner.cpp — 10 changes: 2 additions & 8 deletions

```diff
@@ -27,13 +27,10 @@ using ::executorch::runtime::Result;
 namespace llm = ::executorch::extension::llm;
 
 namespace {
-static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt";
 static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
 static constexpr auto kBosId = "get_bos_id";
 static constexpr auto kEosIds = "get_eos_ids";
 static constexpr auto kMaxSeqLen = "get_max_seq_len";
-static constexpr auto kNBos = "get_n_bos";
-static constexpr auto kNEos = "get_n_eos";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
@@ -50,11 +47,8 @@ Runner::Runner(
       module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
       metadata_({
-          {kAppendEosToPrompt, false},
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
-          {kNBos, 1},
-          {kNEos, 1},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
       }) {
@@ -203,8 +197,8 @@ Error Runner::generate(
 
   Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,
-      metadata_.at(kNBos),
-      metadata_.at(kAppendEosToPrompt) ? metadata_.at(kNEos) : 0);
+      /* bos */ 0,
+      /* eos */ 0);
 
   ET_CHECK_OK_OR_RETURN_ERROR(
       encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
```
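
The runner change is the substantive one: `tokenizer_->encode` is now called with hard-coded zero BOS and EOS counts, so any special tokens must already be present in the prompt string (for example, supplied by a chat template) rather than injected by the runner. A Python illustration of the semantic shift (the real API is the C++ tokenizer; the names and token ids below are made up for the example):

```python
# Toy model of tokenizer encode(prompt, n_bos, n_eos): prepend n_bos BOS
# tokens and append n_eos EOS tokens around the tokenized prompt.
def encode(prompt_ids: list[int], n_bos: int, n_eos: int,
           bos_id: int = 128000, eos_id: int = 128009) -> list[int]:
    return [bos_id] * n_bos + prompt_ids + [eos_id] * n_eos


prompt_ids = [15339, 1917]      # made-up token ids for the example
old = encode(prompt_ids, 1, 0)  # old path: BOS/EOS counts came from metadata
new = encode(prompt_ids, 0, 0)  # new path: the prompt must carry its own special tokens
assert new == prompt_ids and old == [128000, 15339, 1917]
```
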
extension/export_util/export_hf_model.py — 2 changes: 0 additions & 2 deletions

```diff
@@ -85,8 +85,6 @@ def _get_constant_methods(model: PreTrainedModel):
         "get_head_dim": model.config.hidden_size / model.config.num_attention_heads,
         "get_max_batch_size": model.generation_config.cache_config.batch_size,
         "get_max_seq_len": model.generation_config.cache_config.max_cache_len,
-        "get_n_bos": 1,
-        "get_n_eos": 1,
         "get_n_kv_heads": model.config.num_key_value_heads,
         "get_n_layers": model.config.num_hidden_layers,
         "get_vocab_size": model.config.vocab_size,
```
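
The same trimming applies to the Hugging Face export path. A self-contained sketch of the constants dict after the change, using a `SimpleNamespace` stand-in for a `transformers` `PreTrainedModel` with invented values:

```python
from types import SimpleNamespace

# Stand-in for a transformers PreTrainedModel; attribute paths follow the
# diff above, and the numbers are invented for the example.
model = SimpleNamespace(
    config=SimpleNamespace(
        hidden_size=2048,
        num_attention_heads=32,
        num_key_value_heads=8,
        num_hidden_layers=16,
        vocab_size=128256,
    ),
    generation_config=SimpleNamespace(
        cache_config=SimpleNamespace(batch_size=1, max_cache_len=2048),
    ),
)

constants = {
    "get_head_dim": model.config.hidden_size / model.config.num_attention_heads,
    "get_max_batch_size": model.generation_config.cache_config.batch_size,
    "get_max_seq_len": model.generation_config.cache_config.max_cache_len,
    "get_n_kv_heads": model.config.num_key_value_heads,
    "get_n_layers": model.config.num_hidden_layers,
    "get_vocab_size": model.config.vocab_size,
}
assert "get_n_bos" not in constants and "get_n_eos" not in constants
```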