Commit 23d48e1

Merge branch 'main' into angelayi/aoti_api_update
2 parents 73739bd + cbc72a4

4 files changed: +77 −62 lines

.github/workflows/runner-cuda-dtype.yml

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ jobs:
 
       python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2
 
-      ./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
+      ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 
     done

README.md

Lines changed: 1 addition & 1 deletion
@@ -341,7 +341,7 @@ torchchat/utils/scripts/build_native.sh aoti
 
 Then run the compiled executable, with the pt2.
 ```bash
-cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
 ```
 
 ## Mobile Execution

runner/run.cpp

Lines changed: 25 additions & 36 deletions
@@ -102,6 +102,7 @@ typedef struct {
 typedef struct {
   Config config;   // the hyperparameters of the architecture (the blueprint)
   RunState state;  // buffers for the "wave" of activations in the forward pass
+  std::unordered_map<std::string, std::string> metadata;
 
 #ifdef __AOTI_MODEL__
   torch::inductor::AOTIModelPackageLoader *runner;

@@ -141,20 +142,9 @@ void read_checkpoint(char *checkpoint, Config *config) {
   config->vocab_size = abs(config->vocab_size);
 }
 
-void build_transformer(Transformer *t, char *model_path, int vocab_size,
-                       int seq_len) {
-  // read in the Config and the Weights from the model
-  // read_checkpoint(model_path, &t->config);
-  // allocate the RunState buffers
-  t->config.vocab_size = vocab_size;
-  t->config.seq_len = seq_len;
-  malloc_run_state(&t->state, &t->config);
-
+void build_transformer(Transformer *t, char *model_path) {
 #ifdef __AOTI_MODEL__
   t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
-  aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu"
-      ? torch::Device(torch::kCPU)
-      : torch::Device(torch::kCUDA);
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,

@@ -776,9 +766,6 @@ void error_usage() {
          " -v <int> (optional) vocab size, default is model-specific.\n");
   fprintf(stderr,
          " -l <int> (optional) llama version (2 or 3), default 2.\n");
-  fprintf(
-      stderr,
-      " -d <string> (optional) device(CUDA or CPU) model was exported for\n");
   exit(EXIT_FAILURE);
 }
 
@@ -848,37 +835,35 @@ int main(int argc, char *argv[]) {
       system_prompt = argv[i + 1];
     } else if (argv[i][1] == 'l') {
       llama_ver = atoi(argv[i + 1]);
-#ifdef __AOTI_MODEL__
-    } else if (argv[i][1] == 'd') {
-#ifdef USE_CUDA
-      if (strcasecmp(argv[i + 1], "CUDA") == 0) {
-        aoti_device = torch::Device(torch::kCUDA);
-      } else
-#endif
-      if (strcasecmp(argv[i + 1], "CPU") == 0) {
-        aoti_device = torch::Device(torch::kCPU);
-      } else {
-        fprintf(stderr, "Unknown device %s", argv[i + 1]);
-        exit(1);
-      }
-#endif
     } else {
       error_usage();
     }
   }
 
+  if (model_path == NULL) {
+    fprintf(stderr, "No model_path provided.");
+    error_usage();
+  }
+
+  Transformer transformer;
+  build_transformer(&transformer, model_path);
+
+#ifdef __AOTI_MODEL__
+  auto aoti_metadata = transformer.runner->get_metadata();
+  aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu"
+      ? torch::Device(torch::kCPU)
+      : torch::Device(torch::kCUDA);
+  ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
+#else // __ET_MODEL__
   ModelType model_type = get_model_type(llama_ver);
+#endif
+
   if (model_type == UNKNOWN_MODEL) {
     fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.",
             llama_ver);
     error_usage();
   }
 
-  if (model_path == NULL) {
-    fprintf(stderr, "No model_path provided.");
-    error_usage();
-  }
-
   if (tokenizer_path == NULL) {
     fprintf(stderr, "No tokenizer_path provided.");
     error_usage();

@@ -901,8 +886,12 @@ int main(int argc, char *argv[]) {
     vocab_size = tokenizer->vocab_size();
   }
 
-  Transformer transformer;
-  build_transformer(&transformer, model_path, vocab_size, steps);
+  // read in the Config and the Weights from the model
+  // read_checkpoint(model_path, &t->config);
+  // allocate the RunState buffers
+  transformer.config.vocab_size = vocab_size;
+  transformer.config.seq_len = steps;
+  malloc_run_state(&transformer.state, &transformer.config);
 
   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);

torchchat/export.py

Lines changed: 50 additions & 24 deletions
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
-from typing import Optional
+from typing import Dict, Optional
 
 import torch
 import torch._inductor

@@ -39,6 +39,7 @@ def export_for_server(
     output_path: str = "model.pt2",
     dynamic_shapes: bool = False,
     package: bool = True,
+    metadata: Optional[Dict[str, str]] = None,
 ) -> str:
     """
     Export the model using AOT Compile to get a .dso for server use cases.

@@ -67,8 +68,10 @@ def export_for_server(
         dynamic_shapes = None
 
     with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
-        metadata = {}  # TODO: put more metadata here
-        options = {"aot_inductor.metadata": metadata}
+        options = {
+            "aot_inductor.package": package,
+            "aot_inductor.metadata": metadata or {},
+        }
         if not package:
             options = {"aot_inductor.output_path": output_path}
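Note (editor-added, not part of this diff): the new options logic only carries the caller-supplied metadata when packaging is enabled; the legacy non-packaged path replaces the whole dict with `aot_inductor.output_path` and therefore drops the metadata. A minimal, self-contained sketch of that behavior, using a hypothetical helper name:

```python
from typing import Dict, Optional


def build_aoti_options(
    package: bool,
    output_path: str,
    metadata: Optional[Dict[str, str]] = None,
) -> Dict[str, object]:
    """Mirrors the option-building logic added to export_for_server above."""
    # Packaged (.pt2) exports carry the caller-supplied metadata.
    options = {
        "aot_inductor.package": package,
        "aot_inductor.metadata": metadata or {},
    }
    if not package:
        # The legacy .so path only sets an output location; metadata is dropped.
        options = {"aot_inductor.output_path": output_path}
    return options


print(build_aoti_options(True, "model.pt2", {"tokenizer_type": "3"}))
# {'aot_inductor.package': True, 'aot_inductor.metadata': {'tokenizer_type': '3'}}
print(build_aoti_options(False, "model.so"))
# {'aot_inductor.output_path': 'model.so'}
```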

@@ -106,13 +109,13 @@ def export_for_server(
     from typing import Any, Dict, Tuple, Union
 
     import executorch.exir as exir
+    from executorch.backends.xnnpack._passes.convert_to_linear import (
+        ConvertToLinearPass,
+    )
 
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
         XnnpackDynamicallyQuantizedPartitioner,
     )
-    from executorch.backends.xnnpack._passes.convert_to_linear import (
-        ConvertToLinearPass,
-    )
     from executorch.exir import EdgeProgramManager, to_edge
 
     from executorch.exir.capture._config import (

@@ -170,18 +173,22 @@ def __init__(self, attention: Attention):
 
         self.wo = attention.wo
 
-        max_batch_size, n_heads, max_seq_length, head_dim = (
-            attention.kv_cache[0].k_cache.shape
-        )
+        max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[
+            0
+        ].k_cache.shape
         cache_dtype = attention.kv_cache[0].k_cache.dtype
         # The `Attention` module being replaced can have multiple KV caches
         # (denoted by `cache_lanes`). Thus we follow the same setup format
         # as in `Attention.setup_cache`.
         cache_lanes = len(attention.kv_cache)
-        self.kv_cache = nn.ModuleList([
-            CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype)
-            for _ in range(cache_lanes)
-        ])
+        self.kv_cache = nn.ModuleList(
+            [
+                CustomKVCache(
+                    max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype
+                )
+                for _ in range(cache_lanes)
+            ]
+        )
 
         self.n_heads = attention.n_heads
         self.head_dim = attention.head_dim

@@ -219,9 +226,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0):
         return self.wo(output)
 
 def replace_attention_with_custom_sdpa_attention(module: nn.Module):
-    from executorch.extension.llm.custom_ops import (  # noqa
-        sdpa_with_kv_cache,
-    )
+    from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa
 
     for name, child in module.named_children():
         if isinstance(child, Attention):

@@ -242,7 +247,9 @@ def _to_core_aten(
         raise ValueError(
             f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}"
         )
-    core_aten_ep = export_for_training(model, example_inputs, dynamic_shapes=dynamic_shapes)
+    core_aten_ep = export_for_training(
+        model, example_inputs, dynamic_shapes=dynamic_shapes
+    )
     if verbose:
         logging.info(f"Core ATen graph:\n{core_aten_ep.graph}")
     return core_aten_ep

@@ -354,7 +361,11 @@ def main(args):
 
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
-    set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path)
+    set_backend(
+        dso=args.output_dso_path,
+        pte=args.output_pte_path,
+        aoti_package=args.output_aoti_package_path,
+    )
 
     builder_args.dso_path = None
     builder_args.pte_path = None

@@ -376,6 +387,7 @@ def main(args):
 
     # TODO: clean this up
    # This mess is because ET does not support _weight_int4pack_mm right now
+    tokenizer_args = None
     if not builder_args.gguf_path:
         # tokenizer needed for quantization so get that here,
         try:

@@ -386,9 +398,8 @@ def main(args):
 
         if builder_args.max_seq_length is None:
             if (
-                (output_dso_path is not None or output_aoti_package_path is not None)
-                and not builder_args.dynamic_shapes
-            ):
+                output_dso_path is not None or output_aoti_package_path is not None
+            ) and not builder_args.dynamic_shapes:
                 print("Setting max_seq_length to 300 for DSO export.")
                 builder_args.max_seq_length = 300
             elif output_pte_path is not None:

@@ -401,7 +412,8 @@ def main(args):
             quantize,
             tokenizer,
             max_seq_length=builder_args.max_seq_length,
-            support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None,
+            support_tensor_subclass=output_dso_path is None
+            and output_aoti_package_path is None,
         )
         model_to_pte = model
         model_to_dso = model

@@ -439,7 +451,9 @@ def main(args):
     if output_dso_path:
         output_dso_path = str(os.path.abspath(output_dso_path))
         print(f"Exporting model using AOT Inductor to {output_dso_path}")
-        print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.")
+        print(
+            "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead."
+        )
         export_for_server(
             model_to_dso,
             builder_args.device,

@@ -450,11 +464,23 @@ def main(args):
 
     if output_aoti_package_path:
         output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
-        print(f"Exporting model using AOT Inductor to {output_aoti_package_path}")
+
+        if tokenizer_args is None:
+            tokenizer_type = "0"
+        elif tokenizer_args.is_sentencepiece:
+            tokenizer_type = "2"  # Corresponding to llama2
+        else:
+            tokenizer_type = "3"  # Corresponding to llama3
+
+        metadata = {"tokenizer_type": tokenizer_type}
+        print(
+            "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
+        )
         export_for_server(
             model_to_aoti_package,
             builder_args.device,
             output_aoti_package_path,
             builder_args.dynamic_shapes,
             package=True,
+            metadata=metadata,
        )
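Taken together with the runner/run.cpp changes above, the export step now embeds a `tokenizer_type` entry in the .pt2 package, and the C++ runner recovers it (along with `AOTI_DEVICE_KEY`) via `AOTIModelPackageLoader::get_metadata()`, which is why the `-d` and `-l 3` flags disappear from the workflow and the README. Below is an editor-added sketch of the underlying mechanism, not part of this commit: it assumes a recent PyTorch build where `torch._inductor.aoti_compile_and_package` accepts an exported program plus `package_path`/`inductor_configs` keyword arguments, and the toy module, path, and metadata values are placeholders.

```python
# Editor-added sketch; all names below are placeholders, not torchchat code.
import torch
import torch._inductor


class Toy(torch.nn.Module):
    def forward(self, x):
        return x * 2


ep = torch.export.export(Toy(), (torch.randn(4),))
pt2_path = torch._inductor.aoti_compile_and_package(
    ep,
    package_path="/tmp/toy_model.pt2",  # placeholder output location
    inductor_configs={
        # Same config key the modified export_for_server passes through;
        # "3" corresponds to a llama3-style tokenizer in this commit's convention.
        "aot_inductor.metadata": {"tokenizer_type": "3"},
    },
)
# On the C++ side, runner/run.cpp above reads this back with
#   transformer.runner->get_metadata()["tokenizer_type"]
# so the llama version no longer needs to be passed on the command line.
```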
