Commit 518d961

Update on "[mps] Disable dialect verifier under mps preprocess"
As title. Reverting dim_order ops produces an illegal IR, which is OK given we are inside the MPS preprocess function, which shouldn't have to adhere to edge_ir constraints. Differential Revision: [D73205726](https://our.internmc.facebook.com/intern/diff/D73205726/) [ghstack-poisoned]
2 parents 6ba9773 + 52ba322 commit 518d961
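Note: the MPS preprocess change named in the title is not among the hunks shown below. For context, here is a minimal sketch of how EXIR dialect verification is usually relaxed from user code, via the public EdgeCompileConfig knob. The verifier toggle inside the MPS preprocess function itself may work differently, so treat this as an illustrative assumption, not the commit's mechanism:

    # Sketch only: relaxing EXIR dialect verification so a graph with
    # intentionally non-conforming ops (e.g. reverted dim_order ops) can
    # pass through to_edge without tripping the edge-dialect verifier.
    # The commit disables verification inside MPS preprocess itself; this
    # public knob is an illustrative stand-in.
    import torch
    from executorch.exir import EdgeCompileConfig, to_edge

    class Model(torch.nn.Module):
        def forward(self, x):
            return x + 1

    ep = torch.export.export(Model(), (torch.randn(2, 2),))
    edge = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=False))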


51 files changed: +930 −228 lines

.github/workflows/android-release-artifacts.yml

Lines changed: 7 additions & 1 deletion
@@ -80,6 +80,12 @@ jobs:
 
       echo -n "$SECRET_EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS" | base64 -d > /tmp/secring.gpg
 
+      # Update the version name in build.gradle in case of maven publish
+      VERSION="${{ inputs.version }}"
+      if [ ! -z "$VERSION" ]; then
+        sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle
+      fi
+
       # Build AAR Package
       mkdir aar-out
       export BUILD_AAR_DIR=aar-out

@@ -92,7 +98,7 @@ jobs:
       # Publish to maven staging
       UPLOAD_TO_MAVEN="${{ inputs.upload_to_maven }}"
       if [[ "$UPLOAD_TO_MAVEN" == "true" ]]; then
-        (cd aar-out; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:publishToMavenCentral)
+        (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:publishToMavenCentral)
       fi
 
   upload-release-aar:
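The sed invocation in the first hunk above is dense. A hedged Python equivalent (illustration only, not part of the workflow) makes the three capture groups explicit: the prefix up to the version, the dotted version number, and the closing quote-paren. The sample line and version value are made up:

    # Mirrors the sed substitution: keep groups 1 and 3, swap group 2
    # (the old version) for the release version.
    import re

    line = 'coordinates("org.pytorch", "executorch-android", "0.5.0")'
    version = "0.6.0"  # stand-in for ${{ inputs.version }}
    pattern = r'(coordinates\("org\.pytorch", "executorch-android", ")([0-9]+\.[0-9]+\.[0-9]+)("\))'
    print(re.sub(pattern, r"\g<1>" + version + r"\g<3>", line))
    # -> coordinates("org.pytorch", "executorch-android", "0.6.0")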

CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -761,12 +761,16 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_LLM)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers)
 endif()
 
-if(EXECUTORCH_BUILD_EXTENSION_MODULE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ executorch
 │   └── <a href="devtools/visualization">visualization</a> - Visualization tools for representing model structure and performance metrics.
 ├── <a href="docs">docs</a> - Static docs tooling and documentation source files.
 ├── <a href="examples">examples</a> - Examples of various user flows, such as model export, delegates, and runtime execution.
-├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/main/export.ir_spec.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="/docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
+├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/stable/export.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="/docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
 │   ├── <a href="exir/_serialize">_serialize</a> - Serialize final export artifact.
 │   ├── <a href="exir/backend">backend</a> - Backend delegate ahead of time APIs.
 │   ├── <a href="exir/capture">capture</a> - Program capture.

backends/cadence/aot/replace_ops.py

Lines changed: 0 additions & 25 deletions
@@ -1806,30 +1806,6 @@ def call_operator(self, op, args, kwargs, meta):
         return super().call_operator(op, tuple(new_args), kwargs, meta)
 
 
-@register_cadence_pass(CadencePassAttribute(opt_level=0))
-class ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass(ExportPass):
-    """
-    Replace the aten.linalg_vector_norm op with a custom op.
-    aten.linalg_vector_norm is not supported by Jarvis, so we
-    need to replace it with native_batch_norm at all optimization levels.
-    """
-
-    def call_operator(self, op, args, kwargs, meta):
-        if op != exir_ops.edge.aten.linalg_vector_norm.default:
-            return super().call_operator(op, args, kwargs, meta)
-
-        assert (
-            len(args) == 1
-        ), "aten.linalg_vector_norm should have 1 argument (a tensor), we do not support any custom variants"
-
-        return super().call_operator(
-            exir_ops.edge.cadence.linalg_vector_norm.default,
-            args,
-            kwargs,
-            meta,
-        )
-
-
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
 class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
     """

@@ -2243,7 +2219,6 @@ class CadenceReplaceOpsInGraph:
         ReplacePT2DequantWithCadenceDequantPass,
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
-        ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
         # ReplaceGeluWithApproximateGeluPass,
     ]

backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 0 additions & 31 deletions
@@ -23,7 +23,6 @@
     MakeSliceAndCatDimOutermostPass,
     ReplaceAddMMWithLinearPass,
     ReplaceAtenConvolutionWithJarvisConvolutionPass,
-    ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
     ReplaceConstantPadNdWithSlicePass,
     ReplaceConvolutionOptionalArgsWithConcreteArgsPass,
     ReplaceConvWithIm2RowAndLinear,

@@ -1189,36 +1188,6 @@ def forward(self, x):
             count_node(graph_after_passes, exir_ops.edge.aten.transpose_copy.int), 0
         )
 
-    def test_replace_aten_linalg_vector_norm_with_cadence_linalg_vector_norm(self):
-        class LinalgVectorNorm(torch.nn.Module):
-            def forward(self, x: torch.Tensor):
-                return torch.linalg.vector_norm(x)
-
-        x = torch.randn(32)
-
-        graph_module = (
-            export_to_edge(LinalgVectorNorm(), (x,)).exported_program().graph_module
-        )
-
-        p = ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass()
-        graph_after_passes = cast(PassResult, p(graph_module)).graph_module
-
-        # Assert that aten.linalg_vector_norm op was replaced by a
-        # cadence.linalg_vector_norm op
-        self.assertEqual(
-            count_node(
-                graph_after_passes,
-                exir_ops.edge.aten.linalg_vector_norm.default,
-            ),
-            0,
-        )
-        self.assertEqual(
-            count_node(
-                graph_after_passes, exir_ops.edge.cadence.linalg_vector_norm.default
-            ),
-            1,
-        )
-
     def test_replace_aten_where_with_cadence_where_Scalar(self):
         class WhereScalarModel(torch.nn.Module):
             def forward(self, cond: torch.Tensor):

examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm

Lines changed: 5 additions & 1 deletion
@@ -12,6 +12,7 @@
 #import <executorch/examples/models/llama/runner/runner.h>
 #import <executorch/examples/models/llava/runner/llava_runner.h>
 
+using executorch::extension::llm::GenerationConfig;
 using executorch::extension::llm::Image;
 using executorch::runtime::Error;
 

@@ -61,8 +62,11 @@ - (BOOL)generate:(NSString*)prompt
        sequenceLength:(NSInteger)seq_len
     withTokenCallback:(nullable void (^)(NSString*))callback
                 error:(NSError**)error {
+  const GenerationConfig config{
+      .seq_len = static_cast<int32_t>(seq_len)
+  };
   const auto status = _runner->generate(
-      prompt.UTF8String, seq_len, [callback](const std::string& token) {
+      prompt.UTF8String, config, [callback](const std::string& token) {
         callback(@(token.c_str()));
       });
   if (status != Error::Ok) {

examples/mediatek/executor_runner/mtk_llama_runner.cpp

Lines changed: 2 additions & 4 deletions
@@ -80,11 +80,9 @@ bool MTKLlamaRunner::is_loaded() const {
 
 Error MTKLlamaRunner::generate(
     const std::string& prompt,
-    int32_t seq_len,
+    executorch::extension::llm::GenerationConfig config,
     std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback,
-    bool echo,
-    bool warming) {
+    std::function<void(const Stats&)> stats_callback) {
   if (!is_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(load());
   }

examples/mediatek/executor_runner/mtk_llama_runner.h

Lines changed: 2 additions & 4 deletions
@@ -43,11 +43,9 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner {
   Error load();
   Error generate(
       const std::string& prompt,
-      int32_t seq_len = 128,
+      executorch::extension::llm::GenerationConfig config,
       std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {},
-      bool echo = true,
-      bool warming = false);
+      std::function<void(const Stats&)> stats_callback = {});
   void stop();
 
   LlamaModelOptions get_model_options();

examples/models/llama/main.cpp

Lines changed: 9 additions & 4 deletions
@@ -53,7 +53,7 @@ int32_t main(int32_t argc, char** argv) {
 
   const char* prompt = FLAGS_prompt.c_str();
 
-  double temperature = FLAGS_temperature;
+  float temperature = FLAGS_temperature;
 
   int32_t seq_len = FLAGS_seq_len;
 

@@ -73,13 +73,18 @@
   }
 #endif
   // create llama runner
-  example::Runner runner(model_path, tokenizer_path, temperature);
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+  example::Runner runner(model_path, tokenizer_path);
 
   if (warmup) {
-    runner.warmup(prompt, seq_len);
+    // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+    runner.warmup(prompt, /*max_new_tokens=*/seq_len);
   }
   // generate
-  runner.generate(prompt, seq_len);
+  executorch::extension::llm::GenerationConfig config{
+      .seq_len = seq_len, .temperature = temperature};
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+  runner.generate(prompt, config);
 
   return 0;
 }

examples/models/llama/model.py

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,7 @@
 from executorch.examples.models.llama.llama_transformer import Transformer
 
 from executorch.examples.models.llama.model_args import ModelArgs
+from torchao.utils import TorchAOBaseTensor
 
 try:
     from .fairseq2 import convert_to_llama_checkpoint

@@ -257,6 +258,9 @@ def __init__(self, **kwargs):
                 strict=False,
                 assign=True,
             ) # self.model_ = Transformer(gptconf)
+            for param in self.model_.parameters():
+                if isinstance(param, TorchAOBaseTensor):
+                    param.requires_grad = False
         else:
             print("Checkpoint not provided, defaulting weights to zeros.")
             self.model_.to_empty(device="cpu")