pytorch · Jack-Khuu · Mar 21, 2024
@@ -68,6 +68,7 @@ def load_llama_model(
     use_sdpa_with_kv_cache: bool = False,
     weight_type: WeightType = WeightType.LLAMA,
     verbose: bool = False,
+    max_seq_len: int = 128,
 ) -> "LlamaEdgeManager":
     """
     A helper util that builds a Llama2 model. It returns a LlamaEdgeManager that
@@ -87,6 +88,7 @@ def load_llama_model(
         use_kv_cache=use_kv_cache,
         use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
         fairseq2=weight_type == WeightType.FAIRSEQ2,
+        max_seq_len=max_seq_len,
     )
     state_dict = model.state_dict()
     dtype = state_dict[next(iter(state_dict))].dtype

@@ -140,12 +140,6 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--limit", type=int, default=5, help="number of samples to evalulate"
     )
-    parser.add_argument(
-        "--max_seq_length",
-        type=int,
-        default=100,
-        help="maximum length sequence to evaluate",
-    )
 
     return parser
 

@@ -391,6 +391,13 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="Override the output filename of the saved pte model file.",
     )
 
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=128,
+        help="maximum length sequence to evaluate",
+    )
+
     parser.add_argument("-2", "--fairseq2", action="store_true")
     parser.add_argument("-v", "--verbose", action="store_true")
     parser.add_argument("-X", "--xnnpack", action="store_true")
@@ -511,6 +518,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
             use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
             weight_type=weight_type,
             verbose=args.verbose,
+            max_seq_len=args.max_seq_length,
         )
         .set_output_dir(output_dir_path)
         .set_metadata(args.metadata)

@@ -66,6 +66,8 @@ def __init__(self, **kwargs):
             if "use_sdpa_with_kv_cache" in kwargs
             else False
         )
+
+        self.max_seq_len = kwargs["max_seq_len"] if "max_seq_len" in kwargs else 128
         # This example uses a small dummy model with random weights, for demo purposes only.
         # Follow the instructions at https://github.com/facebookresearch/llama to download the model.
         device = "cpu"
@@ -112,7 +114,7 @@ def __init__(self, **kwargs):
                 )
         with open(params_path, "r") as f:
             params = json.loads(f.read())
-        max_seq_len = 128
+        max_seq_len = self.max_seq_len
         max_batch_size = 1
         model_args: ModelArgs = ModelArgs(
             max_seq_len=max_seq_len,