
Commit b23d414

[aoti] Add cpp packaging for aoti + loading in python
1 parent 9a94b56 commit b23d414

9 files changed: +128 -35 lines changed

.gitignore

Lines changed: 5 additions & 0 deletions

@@ -6,6 +6,7 @@ __pycache__/
 # C extensions
 *.so
 
+.vscode
 .model-artifacts/
 .venv
 .torchchat
@@ -18,3 +19,7 @@ runner-aoti/cmake-out/*
 
 # pte files
 *.pte
+
+checkpoints/
+exportedModels/
+cmake-out/

README.md

Lines changed: 5 additions & 4 deletions

@@ -54,7 +54,7 @@ source .venv/bin/activate
 ```
 [skip default]: end
 
-[shell default]: ./install_requirements.sh
+[shell default]: ./install_requirements.sh
 
 Installations can be tested by
 
@@ -152,11 +152,11 @@ users to test the exported model.
 
 ```
 # Compile
-python3 torchchat.py export llama3 --output-dso-path exportedModels/llama3.so
+python3 torchchat.py export llama3 --output-pt2-path exportedModels/llama3_artifacts
 
 # Execute the exported model using Python
 
-python3 torchchat.py generate llama3 --dso-path exportedModels/llama3.so --prompt "Hello my name is"
+python3 torchchat.py generate llama3 --pt2-path exportedModels/llama3_artifacts --prompt "Hello my name is"
 ```
 
 NOTE: If your machine has cuda add this flag for performance
@@ -174,7 +174,8 @@ scripts/build_native.sh aoti
 
 Execute
 ```bash
-cmake-out/aoti_run exportedModels/llama3.so -z `python3 torchchat.py where llama3`/tokenizer.model -l 3 -i "Once upon a time"
+make -C exportedModels/llama3_artifacts
+cmake-out/aoti_run exportedModels/llama3_artifacts.so -z `python3 torchchat.py where llama3`/tokenizer.model -l 3 -i "Once upon a time"
 ```
 
 ## Mobile Execution

build/builder.py

Lines changed: 48 additions & 12 deletions

@@ -33,6 +33,7 @@ class BuilderArgs:
     gguf_path: Optional[Union[Path, str]] = None
     gguf_kwargs: Optional[Dict[str, Any]] = None
     dso_path: Optional[Union[Path, str]] = None
+    pt2_path: Optional[Union[Path, str]] = None
     pte_path: Optional[Union[Path, str]] = None
     device: Optional[str] = None
     precision: torch.dtype = torch.float32
@@ -50,28 +51,29 @@ def __post_init__(self):
             or (self.checkpoint_dir and self.checkpoint_dir.is_dir())
             or (self.gguf_path and self.gguf_path.is_file())
             or (self.dso_path and Path(self.dso_path).is_file())
+            or (self.pt2_path and Path(self.pt2_path).is_file())
             or (self.pte_path and Path(self.pte_path).is_file())
         ):
             raise RuntimeError(
                 "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path"
             )
 
-        if self.dso_path and self.pte_path:
-            raise RuntimeError("specify either DSO path or PTE path, but not both")
+        if sum(1 for path in (self.dso_path, self.pte_path, self.pt2_path) if path is not None) > 1:
+            raise RuntimeError("specify either DSO path, PT2 path, or PTE path, but not more than one")
 
-        if self.checkpoint_path and (self.dso_path or self.pte_path):
+        if self.checkpoint_path and (self.dso_path or self.pte_path or self.pt2_path):
             print(
-                "Warning: checkpoint path ignored because an exported DSO or PTE path specified"
+                "Warning: checkpoint path ignored because an exported DSO, PT2, or PTE path specified"
             )
-        if self.checkpoint_dir and (self.dso_path or self.pte_path):
+        if self.checkpoint_dir and (self.dso_path or self.pte_path or self.pt2_path):
             print(
-                "Warning: checkpoint dir ignored because an exported DSO or PTE path specified"
+                "Warning: checkpoint dir ignored because an exported DSO, PT2, or PTE path specified"
            )
-        if self.gguf_path and (self.dso_path or self.pte_path):
+        if self.gguf_path and (self.dso_path or self.pte_path or self.pt2_path):
             print(
-                "Warning: GGUF path ignored because an exported DSO or PTE path specified"
+                "Warning: GGUF path ignored because an exported DSO, PT2, or PTE path specified"
             )
-        if not (self.dso_path) and not (self.pte_path):
+        if not (self.dso_path) and not (self.pte_path) and not (self.pt2_path):
             self.prefill_possible = True
 
     @classmethod
@@ -105,6 +107,7 @@ def from_args(cls, args): # -> BuilderArgs:
             checkpoint_path,
             checkpoint_dir,
             args.dso_path,
+            args.pt2_path,
             args.pte_path,
             args.gguf_path,
         ]:
@@ -138,10 +141,11 @@ def from_args(cls, args): # -> BuilderArgs:
             gguf_path=args.gguf_path,
             gguf_kwargs=None,
             dso_path=args.dso_path,
+            pt2_path=args.pt2_path,
             pte_path=args.pte_path,
             device=args.device,
             precision=dtype,
-            setup_caches=(args.output_dso_path or args.output_pte_path),
+            setup_caches=(args.output_dso_path or args.output_pte_path or args.output_pt2_path),
             use_distributed=args.distributed,
             is_chat_model=is_chat_model,
         )
@@ -154,6 +158,7 @@ def from_speculative_args(cls, args): # -> BuilderArgs:
         speculative_builder_args.checkpoint_path = args.draft_checkpoint_path
         speculative_builder_args.gguf_path = None
         speculative_builder_args.dso_path = None
+        speculative_builder_args.pt2_path = None
         speculative_builder_args.pte_path = None
         return speculative_builder_args
 
@@ -377,11 +382,12 @@ def _initialize_model(
 ):
     print("Loading model...")
 
-    if builder_args.gguf_path and (builder_args.dso_path or builder_args.pte_path):
+    if builder_args.gguf_path and (builder_args.dso_path or builder_args.pt2_path or builder_args.pte_path):
         print("Setting gguf_kwargs for generate.")
         is_dso = builder_args.dso_path is not None
+        is_pt2 = builder_args.pt2_path is not None
         is_pte = builder_args.pte_path is not None
-        assert not (is_dso and is_pte)
+        assert not (is_dso and is_pt2 and is_pte)
         assert builder_args.gguf_kwargs is None
         # TODO: make GGUF load independent of backend
         # currently not working because AVX int_mm broken
@@ -415,6 +421,36 @@ def _initialize_model(
             )
         except:
             raise RuntimeError(f"Failed to load AOTI compiled {builder_args.dso_path}")
+
+    elif builder_args.pt2_path:
+        if not is_cuda_or_cpu_device(builder_args.device):
+            print(
+                f"Cannot load specified PT2 to {builder_args.device}. Attempting to load model to CPU instead"
+            )
+            builder_args.device = "cpu"
+
+        # assert (
+        #     quantize is None or quantize == "{ }"
+        # ), "quantize not valid for exported PT2 model. Specify quantization during export."
+
+        with measure_time("Time to load model: {time:.02f} seconds"):
+            model = _load_model(builder_args, only_config=True)
+            device_sync(device=builder_args.device)
+
+        try:
+            # Replace model forward with the AOT-compiled forward
+            # This is a hacky way to quickly demo AOTI's capability.
+            # model is still a Python object, and any mutation to its
+            # attributes will NOT be seen on by AOTI-compiled forward
+            # function, e.g. calling model.setup_cache will NOT touch
+            # AOTI compiled and maintained model buffers such as kv_cache.
+            from torch._inductor.package import load_package
+            model.forward = load_package(
+                str(builder_args.pt2_path.absolute()), builder_args.device
+            )
+        except:
+            raise RuntimeError(f"Failed to load AOTI compiled {builder_args.pt2_path}")
+
     elif builder_args.pte_path:
         if not is_cpu_device(builder_args.device):
             print(
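For orientation, the new pt2_path branch above replaces the eager model.forward with the callable returned by torch._inductor.package.load_package. The sketch below isolates that loading pattern outside torchchat, assuming an artifact directory already produced by the new --output-pt2-path export flow; the directory name and input tensors are illustrative, not part of this commit.

```python
# Minimal sketch of loading a packaged AOTInductor artifact in Python,
# mirroring the new pt2_path branch in build/builder.py.
# The directory below is a hypothetical example produced by
# `torchchat.py export ... --output-pt2-path exportedModels/llama3_artifacts`.
from pathlib import Path

import torch
from torch._inductor.package import load_package

pt2_dir = Path("exportedModels/llama3_artifacts")  # assumed artifact directory
device = "cpu"

# load_package returns a callable that runs the AOT-compiled forward graph.
compiled_forward = load_package(str(pt2_dir.absolute()), device)

# Token ids and positions follow the example shapes traced in export_aoti.py,
# so the call signature matches the compiled forward.
idx = torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device)
input_pos = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device)

logits = compiled_forward(idx, input_pos)
print(logits.shape)
```
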

build/utils.py

Lines changed: 13 additions & 9 deletions

@@ -69,42 +69,46 @@ def unpack_packed_weights(
 
 active_builder_args_dso = None
 active_builder_args_pte = None
+active_builder_args_pt2 = None
 
 
-def set_backend(dso, pte):
+def set_backend(dso, pte, pt2):
     global active_builder_args_dso
     global active_builder_args_pte
     active_builder_args_dso = dso
+    active_builder_args_pt2 = pt2
     active_builder_args_pte = pte
 
 
 def use_aoti_backend() -> bool:
     global active_builder_args_dso
+    global active_builder_args_pt2
     global active_builder_args_pte
 
     # eager == aoti, which is when backend has not been explicitly set
-    if (not active_builder_args_dso) and not (active_builder_args_pte):
+    if (not active_builder_args_dso) and not (active_builder_args_pte) and not (active_builder_args_pt2):
         return True
 
-    if active_builder_args_pte and active_builder_args_dso:
+    if sum(1 for builder in (active_builder_args_pte, active_builder_args_dso, active_builder_args_pt2)) > 1:
         raise RuntimeError(
-            "code generation needs to choose different implementations for DSO and PTE path. Please only use one export option, and call export twice if necessary!"
+            "code generation needs to choose different implementations for DSO, PT2, and PTE path. Please only use one export option, and call export twice if necessary!"
         )
 
-    return bool(active_builder_args_dso)
+    return bool(active_builder_args_dso) or bool(active_builder_args_pt2)
 
 
 def use_et_backend() -> bool:
     global active_builder_args_dso
+    global active_builder_args_pt2
     global active_builder_args_pte
 
     # eager == aoti, which is when backend has not been explicitly set
-    if not (active_builder_args_pte or active_builder_args_dso):
-        return False
+    if (not active_builder_args_dso) and not (active_builder_args_pte) and not (active_builder_args_pt2):
+        return True
 
-    if active_builder_args_pte and active_builder_args_dso:
+    if sum(1 for builder in (active_builder_args_pte, active_builder_args_dso, active_builder_args_pt2)) > 1:
         raise RuntimeError(
-            "code generation needs to choose different implementations for DSO and PTE path. Please only use one export option, and call export twice if necessary!"
+            "code generation needs to choose different implementations for DSO, PT2, and PTE path. Please only use one export option, and call export twice if necessary!"
         )
 
     return bool(active_builder_args_pte)

cli.py

Lines changed: 12 additions & 0 deletions

@@ -210,6 +210,12 @@ def _add_export_output_path_args(parser) -> None:
         default=None,
         help="Output to the specified AOT Inductor .dso model file",
     )
+    output_path_parser.add_argument(
+        "--output-pt2-path",
+        type=str,
+        default=None,
+        help="Output directory for AOTInductor compiled artifacts",
+    )
 
 
 # Add CLI Args representing user provided exported model files
@@ -221,6 +227,12 @@ def _add_exported_model_input_args(parser) -> None:
         default=None,
         help="Use the specified AOT Inductor .dso model file",
     )
+    exported_model_path_parser.add_argument(
+        "--pt2-path",
+        type=Path,
+        default=None,
+        help="Use the specified directory containing AOT Inductor compiled files",
+    )
     exported_model_path_parser.add_argument(
         "--pte-path",
         type=Path,

eval.py

Lines changed: 3 additions & 1 deletion

@@ -233,7 +233,7 @@ def main(args) -> None:
 
     if compile:
         assert not (
-            builder_args.dso_path or builder_args.pte_path
+            builder_args.dso_path or builder_args.pte_path or builder_args.pt2_path
         ), "cannot compile exported model"
         global model_forward
         model_forward = torch.compile(
@@ -260,6 +260,8 @@ def main(args) -> None:
     )
     if builder_args.dso_path:
         print(f"For model {builder_args.dso_path}")
+    if builder_args.pt2_path:
+        print(f"For model {builder_args.pt2_path}")
     elif builder_args.pte_path:
         print(f"For model {builder_args.pte_path}")
     elif builder_args.checkpoint_path:

export.py

Lines changed: 15 additions & 5 deletions

@@ -20,7 +20,7 @@
 
 from build.utils import set_backend, set_precision
 from cli import add_arguments_for_verb, arg_init, check_args
-from export_aoti import export_model as export_model_aoti
+from export_aoti import export_model_so, export_model_pt2
 
 try:
     executorch_export_available = True
@@ -39,14 +39,16 @@ def main(args):
 
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
-    set_backend(dso=args.output_dso_path, pte=args.output_pte_path)
+    set_backend(dso=args.output_dso_path, pte=args.output_pte_path, pt2=args.output_pt2_path)
 
     builder_args.dso_path = None
     builder_args.pte_path = None
+    builder_args.pt2_path = None
     builder_args.setup_caches = True
 
     output_pte_path = args.output_pte_path
     output_dso_path = args.output_dso_path
+    output_pt2_path = args.output_pt2_path
 
     if output_pte_path and builder_args.device != "cpu":
         print(
@@ -74,6 +76,7 @@ def main(args):
         )
         model_to_pte = model
         model_to_dso = model
+        model_to_pt2 = model
     else:
         if output_pte_path:
             _set_gguf_kwargs(builder_args, is_et=True, context="export")
@@ -83,12 +86,13 @@ def main(args):
             )
             _unset_gguf_kwargs(builder_args)
 
-        if output_dso_path:
+        if output_dso_path or output_pt2_path:
             _set_gguf_kwargs(builder_args, is_et=False, context="export")
-            model_to_dso = _initialize_model(
+            model_to_pt2 = _initialize_model(
                 builder_args,
                 quantize,
             )
+            model_to_dso = model_to_pt2
             _unset_gguf_kwargs(builder_args)
 
     with torch.no_grad():
@@ -104,10 +108,16 @@ def main(args):
                     "Export with executorch requested but ExecuTorch could not be loaded"
                 )
                 print(executorch_exception)
+
         if output_dso_path:
             output_dso_path = str(os.path.abspath(output_dso_path))
             print(f"Exporting model using AOT Inductor to {output_dso_path}")
-            export_model_aoti(model_to_dso, builder_args.device, output_dso_path, args)
+            export_model_so(model_to_dso, builder_args.device, output_dso_path, args)
+
+        if output_pt2_path:
+            output_pt2_path = str(os.path.abspath(output_pt2_path))
+            print(f"Exporting model using AOT Inductor to {output_pt2_path}")
+            export_model_pt2(model_to_pt2, builder_args.device, output_pt2_path, args)
 
 
 if __name__ == "__main__":

export_aoti.py

Lines changed: 24 additions & 1 deletion

@@ -12,7 +12,30 @@
 default_device = "cpu"
 
 
-def export_model(model: nn.Module, device, output_path, args=None):
+def export_model_pt2(model: nn.Module, device, output_path, args=None):
+    max_seq_length = 350
+
+    input = (
+        torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
+        torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
+    )
+
+    seq = Dim("seq", min=1, max=max_seq_length)
+    # Specify that the first dimension of each input is that batch size
+    dynamic_shapes = {"idx": {1: seq}, "input_pos": {0: seq}}
+
+    model.to(device)
+    pt2_path = torch._export.aot_compile(
+        model,
+        args=input,
+        options={"aot_inductor.output_path": output_path, "aot_inductor.package": True},
+        dynamic_shapes=dynamic_shapes,
+    )
+    print(f"The AOTInductor compiled files can be found at: {pt2_path}")
+    return pt2_path
+
+
+def export_model_so(model: nn.Module, device, output_path, args=None):
     max_seq_length = 350
 
     input = (
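Together with the loader shown earlier, the packaged flow is symmetric: aot_compile with the aot_inductor.package option writes a directory of artifacts, and load_package turns that directory back into a callable. Below is a toy end-to-end sketch under those assumptions; ToyModel and the ./toy_artifacts output directory are illustrative names, and the sketch assumes the same PyTorch nightly APIs this commit relies on.

```python
# Toy end-to-end sketch of the packaged AOTInductor flow introduced here:
# compile with aot_compile(..., options={"aot_inductor.package": True}) and
# load the resulting directory with load_package, as build/builder.py does.
# ToyModel and "./toy_artifacts" are hypothetical names for illustration.
import os

import torch
import torch.nn as nn
from torch.export import Dim
from torch._inductor.package import load_package


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)


model = ToyModel().eval()
example = (torch.randn(4, 16),)

# Let the batch dimension vary at runtime, analogous to the "seq" Dim above.
batch = Dim("batch", min=1, max=32)
dynamic_shapes = {"x": {0: batch}}

os.makedirs("./toy_artifacts", exist_ok=True)
torch._export.aot_compile(
    model,
    args=example,
    dynamic_shapes=dynamic_shapes,
    options={"aot_inductor.output_path": "./toy_artifacts", "aot_inductor.package": True},
)

# Load the packaged artifacts back as a callable and run with a new batch size.
compiled = load_package("./toy_artifacts", "cpu")
print(compiled(torch.randn(8, 16)).shape)
```

Note that the README change in this commit adds a make -C step only before running the C++ aoti_run binary; the Python path in build/builder.py consumes the artifact directory directly via load_package.
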

generate.py

Lines changed: 3 additions & 3 deletions

@@ -91,8 +91,8 @@ def validate_build(
             reason = "model compilation for prefill"
         if self.compile:
             reason = "model compilation"
-        if builder_args.dso_path:
-            model_type = "DSO"
+        if builder_args.pt2_path:
+            model_type = "PT2"
         if builder_args.pte_path:
             model_type = "PTE"
         if model_type and reason:
@@ -103,7 +103,7 @@
     @classmethod
     def from_args(cls, args):
         sequential_prefill = (
-            args.sequential_prefill or bool(args.dso_path) or bool(args.pte_path)
+            args.sequential_prefill or bool(args.pt2_path) or bool(args.pte_path)
         )
 
         return cls(
