
Commit f12edc8

Merge remote-tracking branch 'origin/main' into tiktoken

2 parents 97fcc30 + 19ed018

File tree: 25 files changed (+526 −241 lines)


backends/transforms/TARGETS

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@ runtime.python_library(
     visibility = [
         "//executorch/backends/...",
         "//executorch/examples/...",
+        "//executorch/extension/llm/...",
     ],
     deps = [
         "//caffe2:torch",

docs/source/extension-module.md

Lines changed: 1 addition & 1 deletion

@@ -136,7 +136,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc
 
 using namespace ::torch::executor;
 
-Module module("/path/to/model.pte", Module::MlockConfig::UseMlock, std::make_unique<ETDumpGen>());
+Module module("/path/to/model.pte", Module::LoadMode::MmapUseMlock, std::make_unique<ETDumpGen>());
 
 // Execute a method, e.g. module.forward(...); or module.execute("my_method", ...);
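
As context for the rename above, here is a minimal sketch of constructing a Module with the new LoadMode enum and an ETDumpGen tracer. The include paths and the Error::Ok check are assumptions based on the repo layout at this commit, not part of the diff.

// Minimal sketch: Module::LoadMode replaces the old Module::MlockConfig.
// Values seen in this commit: File, MmapUseMlock, MmapUseMlockIgnoreErrors.
#include <executorch/extension/module/module.h>
#include <executorch/sdk/etdump/etdump_flatcc.h>  // assumed ETDumpGen location

#include <memory>

using namespace ::torch::executor;

int main() {
  // Mmap the program and mlock its pages, as in the updated doc snippet.
  Module module(
      "/path/to/model.pte",
      Module::LoadMode::MmapUseMlock,
      std::make_unique<ETDumpGen>());
  return module.load() == Error::Ok ? 0 : 1;
}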

docs/source/llm/getting-started.md

Lines changed: 6 additions & 5 deletions

@@ -313,6 +313,8 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi
 ```cpp
 // main.cpp
 
+using namespace torch::executor;
+
 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
   std::cout << "Enter model prompt: ";
@@ -327,7 +329,7 @@ int main() {
   BasicSampler sampler = BasicSampler();
 
   // Load the exported nanoGPT program, which was generated via the previous steps.
-  Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;
@@ -787,15 +789,14 @@ Include the ETDump header in your code.
 
 Create an Instance of the ETDumpGen class and pass it to the Module constructor.
 ```cpp
-std::unique_ptr<torch::executor::ETDumpGen> etdump_gen_ = std::make_unique<torch::executor::ETDumpGen>();
-Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors, std::move(etdump_gen_));
+std::unique_ptr<ETDumpGen> etdump_gen_ = std::make_unique<ETDumpGen>();
+Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors, std::move(etdump_gen_));
 ```
 
 After calling `generate()`, save the ETDump to a file. You can capture multiple
 model runs in a single trace, if desired.
 ```cpp
-torch::executor::ETDumpGen* etdump_gen =
-    static_cast<torch::executor::ETDumpGen*>(model.event_tracer());
+ETDumpGen* etdump_gen = static_cast<ETDumpGen*>(model.event_tracer());
 
 ET_LOG(Info, "ETDump size: %zu blocks", etdump_gen->get_num_blocks());
 etdump_result result = etdump_gen->get_etdump_data();
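
The doc snippet ends at get_etdump_data(). As a minimal sketch of the next step, here is one way to persist that buffer to disk; the etdump_result field names (buf, size) and the caller-owns-buffer convention are assumptions drawn from the SDK headers of this era, and save_etdump is a hypothetical helper name.

// Minimal sketch (assumed field names and ownership, see note above).
#include <cstdio>
#include <cstdlib>

#include <executorch/sdk/etdump/etdump_flatcc.h>  // assumed ETDumpGen location

void save_etdump(torch::executor::ETDumpGen& etdump_gen, const char* path) {
  auto result = etdump_gen.get_etdump_data();
  if (result.buf != nullptr && result.size > 0) {
    FILE* f = std::fopen(path, "w+");
    if (f != nullptr) {
      // Write the serialized trace so the SDK tooling can inspect it later.
      std::fwrite(result.buf, 1, result.size, f);
      std::fclose(f);
    }
    std::free(result.buf);  // assumption: get_etdump_data() hands ownership to the caller
  }
}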

examples/llm_manual/main.cpp

Lines changed: 1 addition & 3 deletions

@@ -110,9 +110,7 @@ int main() {
 
   // Load the exported nanoGPT program, which was generated via the previous
   // steps.
-  Module model(
-      "nanogpt.pte",
-      torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;

examples/models/llama2/TARGETS

Lines changed: 0 additions & 4 deletions

@@ -64,7 +64,6 @@ runtime.python_binary(
 runtime.python_library(
     name = "export_library",
     srcs = [
-        "builder.py",
         "export_llama.py",
         "export_llama_lib.py",
         "model.py",
@@ -82,13 +81,10 @@ runtime.python_library(
     ],
     deps = [
         "//caffe2:torch",
-        "//executorch/backends/transforms:duplicate_dynamic_quant_chain",
         "//executorch/examples/models:model_base",
         "//executorch/examples/models:models",
         "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_py",
-        "//executorch/exir:lib",
         "//executorch/extension/llm/export:export_lib",
-        "//executorch/extension/export_util:export_util",
         # one definition has to be included in the user of the libarary
         # depending on what library the client wants to use
         # "//executorch/extension/pybindings:aten_lib",

examples/models/llama2/eval_llama_lib.py

Lines changed: 3 additions & 2 deletions

@@ -19,9 +19,10 @@
     Tokenizer as SentencePieceTokenizer,
 )
 
+from executorch.extension.llm.export import LLMEdgeManager
+
 from lm_eval.api.model import LM
 
-from .builder import LlamaEdgeManager
 from .export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
@@ -130,7 +131,7 @@ def gen_eval_wrapper(
 
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
     # GPTFastEvalWrapper: Create a wrapper around a pre-exported model
-    manager: LlamaEdgeManager = _prepare_for_llama_export(model_name, args)
+    manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args)
 
     if len(quantizers) != 0:
         manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers)

examples/models/llama2/export_llama_lib.py

Lines changed: 11 additions & 12 deletions

@@ -22,6 +22,8 @@
 
 from executorch.examples.models.llama2.llama_transformer import ModelArgs
 
+from executorch.extension.llm.export.builder import DType, LLMEdgeManager
+
 from executorch.extension.llm.export.partitioner_lib import (
     get_coreml_partitioner,
     get_mps_partitioner,
@@ -40,8 +42,6 @@
 from executorch.util.activation_memory_profiler import generate_memory_trace
 
 from ..model_factory import EagerModelFactory
-
-from .builder import DType, LlamaEdgeManager
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
@@ -333,12 +333,12 @@ def export_llama(modelname, args) -> str:
     return filename
 
 
-def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
+def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
     """
     Helper function for export_llama. Loads the model from checkpoint and params,
-    and sets up a LlamaEdgeManager with initial transforms and dtype conversion.
+    and sets up a LLMEdgeManager with initial transforms and dtype conversion.
 
-    Returns a LlamaEdgeManager prior to calling export_to_edge with quantizers
+    Returns a LLMEdgeManager prior to calling export_to_edge with quantizers
     """
 
     # load model from checkpoint and params.json
@@ -429,7 +429,7 @@ def _validate_args(args):
         )
 
 
-def _export_llama(modelname, args) -> LlamaEdgeManager:  # noqa: C901
+def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
     _validate_args(args)
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
 
@@ -579,12 +579,12 @@ def _load_llama_model(
     verbose: bool = False,
     max_seq_len: int = 128,
     metadata_str: Optional[str] = None,
-) -> "LlamaEdgeManager":
+) -> "LLMEdgeManager":
     """
-    A helper util that builds a Llama2 model. It returns a LlamaEdgeManager that
+    A helper util that builds a Llama2 model. It returns a LLMEdgeManager that
     can help further lower the model to ExecuTorch.
     Returns:
-        An instance of LlamaEdgeManager which contains the eager mode model.
+        An instance of LLMEdgeManager which contains the eager mode model.
     """
     assert (
         checkpoint or checkpoint_dir
@@ -622,13 +622,12 @@ def _load_llama_model(
     else:
         raise ValueError(f"Unsupported dtype {dtype}")
 
-    return LlamaEdgeManager(
+    return LLMEdgeManager(
         model=model,
         modelname=modelname,
-        weight_type=weight_type,
+        max_seq_len=model.params.max_seq_len,
         dtype=dtype,
         use_kv_cache=use_kv_cache,
-        use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
         example_inputs=example_inputs,
         enable_dynamic_shape=enable_dynamic_shape,
         verbose=verbose,

examples/models/llama2/runner/runner.cpp

Lines changed: 5 additions & 12 deletions

@@ -15,7 +15,6 @@
 #else /* BPE */
 #include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
-#include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
 
@@ -43,7 +42,10 @@ Runner::Runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
     const float temperature)
-    : model_path_(model_path),
+    // NOTE: we observed ~2x loading performance increase on iPhone 15
+    // and a ~5% improvement on Galaxy S22 by switching to
+    // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
+    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
       temperature_(temperature) {
   ET_LOG(
@@ -54,22 +56,13 @@ Runner::Runner(
 }
 
 bool Runner::is_loaded() const {
-  return module_ && module_->is_loaded() && tokenizer_ && sampler_;
+  return module_->is_loaded() && tokenizer_ && sampler_;
 }
 
 Error Runner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
-  // NOTE: we observed ~2x loading performance increase on iPhone 15
-  // and a ~5% improvement on Galaxy S22 by switching to
-  // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-  auto data_loader_result = util::FileDataLoader::from(model_path_.c_str());
-  if (!data_loader_result.ok()) {
-    return data_loader_result.error();
-  }
-  module_ = std::make_unique<Module>(
-      std::make_unique<util::FileDataLoader>(std::move(*data_loader_result)));
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
 
   // Read out metadata: vocab_size (expected by the model), BOS, EOS, n_BOS,
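
For readers following the runner change: a minimal sketch of the post-change constructor shape, abridged to the members this diff touches (everything else in Runner is omitted, and the class here is illustrative rather than the full upstream definition).

// Sketch only: module_ is now built in the initializer list with
// LoadMode::File, so load() no longer constructs a FileDataLoader and
// is_loaded() can drop its null check on module_.
#include <executorch/extension/module/module.h>

#include <memory>
#include <string>

class Runner {
 public:
  explicit Runner(const std::string& model_path)
      : module_(std::make_unique<torch::executor::Module>(
            model_path,
            torch::executor::Module::LoadMode::File)) {}

  bool is_loaded() const {
    // module_ is always non-null after construction.
    return module_->is_loaded();
  }

 private:
  std::unique_ptr<torch::executor::Module> module_;
};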

examples/models/llama2/runner/targets.bzl

Lines changed: 0 additions & 3 deletions

@@ -31,9 +31,6 @@ def define_common_targets():
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
-            deps = [
-                "//executorch/extension/data_loader:file_data_loader",
-            ],
             exported_deps = [
                 "//executorch/backends/xnnpack:xnnpack_backend",
                 "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,

examples/models/llama2/source_transformation/quantize.py

Lines changed: 2 additions & 2 deletions

@@ -12,9 +12,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from sentencepiece import SentencePieceProcessor
+from executorch.extension.llm.export.builder import DType
 
-from ..builder import DType
+from sentencepiece import SentencePieceProcessor
 
 try:
     from fairseq2.nn.embedding import (

examples/models/phi-3-mini/export_model.py

Lines changed: 0 additions & 65 deletions

This file was deleted.

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.extension.llm.export.builder import DType, LLMEdgeManager
+
+from executorch.extension.llm.export.partitioner_lib import get_xnnpack_partitioner
+from executorch.extension.llm.export.quantizer_lib import (
+    DynamicQuantLinearOptions,
+    get_pt2e_quantizers,
+    PT2EQuantOptions,
+)
+
+from transformers import Phi3ForCausalLM
+
+
+def main() -> None:
+    torch.manual_seed(42)
+
+    # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
+    model = Phi3ForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+    modelname = "phi-3-mini"
+
+    (
+        LLMEdgeManager(
+            model=model,
+            modelname=modelname,
+            max_seq_len=128,
+            dtype=DType.fp32,
+            use_kv_cache=False,
+            example_inputs=(torch.randint(0, 100, (1, 100), dtype=torch.long),),
+            enable_dynamic_shape=True,
+            verbose=True,
+        )
+        .set_output_dir(".")
+        .capture_pre_autograd_graph()
+        .pt2e_quantize(
+            get_pt2e_quantizers(PT2EQuantOptions(None, DynamicQuantLinearOptions()))
+        )
+        .export_to_edge()
+        .to_backend([get_xnnpack_partitioner()])
+        .to_executorch()
+        .save_to_pte(f"{modelname}.pte")
+    )
+
+
+if __name__ == "__main__":
+    main()

examples/models/phi-3-mini/main.cpp

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@ int main() {
 
   SentencePieceTokenizer tokenizer("tokenizer.model");
 
-  Module model("phi-3-mini.pte", Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("phi-3-mini.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_output_tokens = 128;
   generate(model, prompt, tokenizer, max_output_tokens);

examples/qualcomm/llama2/llama.py

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,6 @@
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
 )
-from executorch.examples.models.llama2.builder import DType
 from executorch.examples.qualcomm.llama2.model.static_llama import LlamaModel, ModelArgs
 from executorch.examples.qualcomm.scripts.utils import (
     make_output_dir,
@@ -41,6 +40,7 @@
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.exir.program._program import _get_updated_graph_signature
+from executorch.extension.llm.export.builder import DType
 
 from sentencepiece import SentencePieceProcessor
 from torch.ao.quantization.observer import MinMaxObserver
