
Commit 2407b37

Merge branch 'pytorch:main' into op_softmax
2 parents: 673d825 + 6b858f2

178 files changed (+1998 lines, -1354 lines)

.ci/scripts/build_llama_android.sh

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-    -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+    -Bcmake-android-out/examples/models/llama examples/models/llama

-  cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release
+  cmake --build cmake-android-out/examples/models/llama -j4 --config Release
 }
 install_flatc_from_source
 install_executorch_and_backend_lib

.ci/scripts/test_llama.sh

Lines changed: 4 additions & 4 deletions
@@ -125,7 +125,7 @@ cmake_install_executorch_libraries() {

 cmake_build_llama_runner() {
   echo "Building llama runner"
-  dir="examples/models/llama2"
+  dir="examples/models/llama"
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Debug \
@@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
 fi
 # Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"
@@ -219,15 +219,15 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
 if [[ "${BUILD_TOOL}" == "buck2" ]]; then
   # Run model.
   # shellcheck source=/dev/null
-  $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt
+  $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt
 elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
   cmake_install_executorch_libraries
   cmake_build_llama_runner
   # Run llama runner
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run llama runner at ${NOW}"
   # shellcheck source=/dev/null
-  cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt
+  cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt
   NOW=$(date +"%H:%M:%S")
   echo "Finished at ${NOW}"
 else

.ci/scripts/test_model.sh

Lines changed: 3 additions & 3 deletions
@@ -75,9 +75,9 @@ run_portable_executor_runner() {
 test_model() {
   if [[ "${MODEL_NAME}" == "llama2" ]]; then
     # Install requirements for export_llama
-    bash examples/models/llama2/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama2.export_llama
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
   fi

.github/scripts/extract_benchmark_results.py

Lines changed: 48 additions & 26 deletions
@@ -9,7 +9,6 @@
 import logging
 import os
 import re
-import time
 import zipfile
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
@@ -26,12 +25,15 @@

 # iOS-related regexes and variables
 IOS_TEST_SPEC_REGEX = re.compile(
-    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
+    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
 )
 IOS_TEST_NAME_REGEX = re.compile(
-    r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+    r"test_(?P<method>forward|load|generate)_(?P<model_name>[\w\+]+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+)
+# The backend name could contain +, i.e. tinyllama_xnnpack+custom+qe_fp32
+IOS_MODEL_NAME_REGEX = re.compile(
+    r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
 )
-IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)")


 class ValidateArtifacts(Action):
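
A quick way to see what the widened `[\w\+]+` groups buy: the sketch below runs the two updated patterns against a made-up xcresult log line and model name (both invented, but shaped like the `tinyllama_xnnpack+custom+qe_fp32` example in the comment above).

```python
import re

# Same patterns as in the diff above
IOS_TEST_SPEC_REGEX = re.compile(
    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
)
IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)")

# Made-up log line; with the old \w+ test-name group, the "+" signs
# would have prevented any match at all.
line = (
    "Test Case '-[Benchmark test_generate_tinyllama_xnnpack+custom+qe_fp32_pte_iOS_17_5_iPhone15_4]'"
    " measured [Tokens Per Second, t/s] average: 21.5, relative standard deviation: 1.2%"
)
m = IOS_TEST_SPEC_REGEX.search(line)
assert m and m.group("metric") == "Tokens Per Second, t/s"

# model_backend_dtype: backtracking splits on the last "_", so "+" stays in the backend
name = IOS_MODEL_NAME_REGEX.match("tinyllama_xnnpack+custom+qe_fp32")
assert (name.group("model"), name.group("backend"), name.group("dtype")) == (
    "tinyllama", "xnnpack+custom+qe", "fp32"
)
```
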
@@ -159,19 +161,8 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
     ios_ver = m.group("ios_ver").replace("_", ".")
     iphone_ver = m.group("iphone_ver").replace("_", ".")

-    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
-    # format by the test, the mapping is needed to match with Android test
-    if method == "load":
-        metric = "model_load_time(ms)"
-    elif method == "forward":
-        metric = (
-            "generate_time(ms)"
-            if "llama" in model_name
-            else "avg_inference_latency(ms)"
-        )
-    elif method == "generate":
-        metric = "token_per_sec"
-
+    # The default backend and quantization dtype if the script couldn't extract
+    # them from the model name
     backend = ""
     quantization = "unknown"

@@ -194,8 +185,9 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
             "availMem": 0,
             "totalMem": 0,
         },
-        "metric": metric,
+        "method": method,
         # These fields will be populated later by extract_ios_metric
+        "metric": "",
         "actualValue": 0,
         "targetValue": 0,
     }
@@ -210,10 +202,38 @@ def extract_ios_metric(
     """
     Map the metric name from iOS xcresult to the benchmark result
     """
-    if metric_name == "Clock Monotonic Time, s":
-        # The benchmark value is in ms
-        benchmark_result["actualValue"] = metric_value * 1000
-    elif metric_name == "Tokens Per Second, t/s":
+    method = benchmark_result.get("method", "")
+    if not method:
+        return benchmark_result
+
+    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
+    # format by the test, the mapping is needed to match with Android test
+    if method == "load":
+        if metric_name == "Clock Monotonic Time, s":
+            benchmark_result["metric"] = "model_load_time(ms)"
+            benchmark_result["actualValue"] = metric_value * 1000
+
+        elif metric_name == "Memory Peak Physical, kB":
+            # NB: Showing the value in MB is friendlier IMO
+            benchmark_result["metric"] = "peak_load_mem_usage(mb)"
+            benchmark_result["actualValue"] = metric_value / 1024
+
+    elif method == "forward":
+        if metric_name == "Clock Monotonic Time, s":
+            benchmark_result["metric"] = (
+                "generate_time(ms)"
+                if "llama" in test_name
+                else "avg_inference_latency(ms)"
+            )
+            benchmark_result["actualValue"] = metric_value * 1000
+
+        elif metric_name == "Memory Peak Physical, kB":
+            # NB: Showing the value in MB is friendlier IMO
+            benchmark_result["metric"] = "peak_inference_mem_usage(mb)"
+            benchmark_result["actualValue"] = metric_value / 1024
+
+    elif method == "generate" and metric_name == "Tokens Per Second, t/s":
+        benchmark_result["metric"] = "token_per_sec"
         benchmark_result["actualValue"] = metric_value

     return benchmark_result
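
Both time metrics are stored in milliseconds and both peak-memory metrics in MB, so the mapping above is pure unit scaling. A two-line sanity check with invented measurements:

```python
clock_s = 0.42      # "Clock Monotonic Time, s" as reported by xcresult
peak_kb = 524288.0  # "Memory Peak Physical, kB"

print(clock_s * 1000)  # 420.0 -> model_load_time(ms), generate_time(ms), avg_inference_latency(ms)
print(peak_kb / 1024)  # 512.0 -> peak_load_mem_usage(mb), peak_inference_mem_usage(mb)
```
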
@@ -235,31 +255,33 @@ def extract_ios_benchmark_results(

     with request.urlopen(artifact_s3_url) as data:
         current_test_name = ""
+        current_metric_name = ""
         current_record = {}

         for line in data.read().decode("utf8").splitlines():
             s = IOS_TEST_SPEC_REGEX.search(line)
             if not s:
                 continue

-            test_class = s.group("test_class")
             test_name = s.group("test_name")
             metric_name = s.group("metric")
             metric_value = float(s.group("value"))

-            if test_name != current_test_name:
-                if current_record:
+            if test_name != current_test_name or metric_name != current_metric_name:
+                if current_record and current_record.get("metric", ""):
                     # Save the benchmark result in the same format used by Android
                     benchmark_results.append(current_record.copy())

                 current_test_name = test_name
+                current_metric_name = metric_name
                 current_record = initialize_ios_metadata(current_test_name)

             current_record = extract_ios_metric(
                 current_record, test_name, metric_name, metric_value
             )

-    benchmark_results.append(current_record.copy())
+    if current_record and current_record.get("metric", ""):
+        benchmark_results.append(current_record.copy())

     return benchmark_results
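
The parsing loop now starts a fresh record whenever either the test name or the metric name changes, and only flushes records whose metric was actually resolved. A self-contained toy of that grouping logic, with invented tuples standing in for parsed log lines:

```python
parsed = [
    ("test_load_m1", "Clock Monotonic Time, s", 0.42),
    ("test_load_m1", "Clock Monotonic Time, s", 0.44),      # repeat of the same measurement
    ("test_load_m1", "Memory Peak Physical, kB", 524288.0),
    ("test_generate_m1", "Tokens Per Second, t/s", 21.5),
]

results, current_key, current_record = [], None, None
for test_name, metric_name, value in parsed:
    if (test_name, metric_name) != current_key:
        if current_record is not None:  # the real loop also drops records with no metric
            results.append(current_record)
        current_key = (test_name, metric_name)
        current_record = {"test": test_name, "metric": metric_name}
    current_record["value"] = value  # repeats overwrite: the last measurement wins

if current_record is not None:
    results.append(current_record)

print([r["metric"] for r in results])
# ['Clock Monotonic Time, s', 'Memory Peak Physical, kB', 'Tokens Per Second, t/s']
```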

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ jobs:

          if [[ ${{ matrix.model }} =~ ^stories* ]]; then
            # Install requirements for export_llama
-           PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
            # Test llama2
            if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
              DELEGATE_CONFIG="xnnpack+custom+qe"

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ jobs:
       # on-demand and periodic benchmarking.
       CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
       CRON_DEFAULT_DEVICES: "apple_iphone_15"
-      CRON_DEFAULT_DELEGATES: "xnnpack"
+      CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
     run: |
       set -ex
       MODELS="${{ inputs.models }}"
@@ -162,7 +162,7 @@ jobs:
         if [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
-            bash examples/models/llama2/install_requirements.sh
+            bash examples/models/llama/install_requirements.sh

           # Test llama2
           if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then

.github/workflows/pull.yml

Lines changed: 9 additions & 3 deletions
@@ -98,6 +98,12 @@ jobs:
          - dtype: bf16
            build-tool: buck2
            mode: portable
+         - dtype: bf16
+           build-tool: cmake
+           mode: custom
+         - dtype: bf16
+           build-tool: buck2
+           mode: custom
       fail-fast: false
     with:
       runner: linux.2xlarge
@@ -117,7 +123,7 @@ jobs:
       # Setup executorch
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

@@ -216,7 +222,7 @@ jobs:
       bash install_requirements.sh --pybind xnnpack

       # install Llava requirements
-      bash examples/models/llama2/install_requirements.sh
+      bash examples/models/llama/install_requirements.sh
       bash examples/models/llava/install_requirements.sh

       # run python unittest
@@ -411,7 +417,7 @@ jobs:
       # Setup executorch
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

.github/workflows/trunk.yml

Lines changed: 6 additions & 4 deletions
@@ -227,6 +227,8 @@ jobs:
        include:
          - dtype: bf16
            mode: portable
+         - dtype: bf16
+           mode: custom
       fail-fast: false
     with:
       runner: macos-m1-stable
@@ -255,7 +257,7 @@ jobs:
       fi

       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"

@@ -279,7 +281,7 @@ jobs:
       # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"

       # # install Llava requirements
-      # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+      # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
       # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh

       # # run python unittest
@@ -385,7 +387,7 @@ jobs:
       cmake --build cmake-out -j9 --target install --config Release

       echo "Build llama runner"
-      dir="examples/models/llama2"
+      dir="examples/models/llama"
       cmake \
         -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DCMAKE_BUILD_TYPE=Release \
@@ -437,5 +439,5 @@ jobs:

       python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}

-      cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+      cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
       echo "::endgroup::"

README.md

Lines changed: 2 additions & 2 deletions
@@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch.

 Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.

-Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.


-**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch.
+**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.

 ## Feedback

backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp

Lines changed: 6 additions & 6 deletions
@@ -253,11 +253,11 @@ bool write_directory_node(InMemoryDirectoryNode* node,
         return false;
     }

-    for (const auto& [_, node]: node->get_items()) {
-        if (node.get()->isDirectory() && !recursive) {
+    for (const auto& [_, node_2]: node->get_items()) {
+        if (node_2.get()->isDirectory() && !recursive) {
             continue;
         }
-        if (!write_node(node.get(), dir_path, recursive, error)) {
+        if (!write_node(node_2.get(), dir_path, recursive, error)) {
             return false;
         }
     }
@@ -383,9 +383,9 @@ FlattenedInMemoryNode::unflatten(const std::vector<FlattenedInMemoryNode>& flatt
     case InMemoryFileSystem::InMemoryNode::Kind::Directory: {
         std::unordered_map<std::string, std::unique_ptr<InMemoryFileSystem::InMemoryNode>> items;
         items.reserve(flattened_node_metadata.child_name_to_indices_map.size());
-        for (const auto& [name, index]: flattened_node_metadata.child_name_to_indices_map) {
-            auto moveIt = std::make_move_iterator(nodes.begin() + index);
-            items[name] = *moveIt;
+        for (const auto& [name_2, index_2]: flattened_node_metadata.child_name_to_indices_map) {
+            auto moveIt = std::make_move_iterator(nodes.begin() + index_2);
+            items[name_2] = *moveIt;
         }
         auto directory_node =
             std::make_unique<InMemoryDirectoryNode>(std::move(name), std::move(attributes), std::move(items));
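
The `_2` suffixes look arbitrary in isolation, but the motivation is visible in the context lines: the old structured bindings shadowed the enclosing `node` parameter in the first hunk, and the `name` local in the second hunk, which is still consumed afterwards by `std::move(name)`. Renaming the loop bindings removes the shadowing.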

backends/qualcomm/_passes/annotate_quant_attrs.py

Lines changed: 24 additions & 4 deletions
@@ -27,9 +27,12 @@ class AnnotateQuantAttrs(ExportPass):
     generated after quantization process.
     """

-    def __init__(self, edge_program: torch.export.ExportedProgram):
+    def __init__(
+        self, edge_program: torch.export.ExportedProgram, skip_advanced_requant: bool
+    ):
         super(AnnotateQuantAttrs, self).__init__()
         self.edge_program = edge_program
+        self.skip_advanced_requant = skip_advanced_requant

     def _annotate_source_nodes(
         self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any]
@@ -68,9 +71,26 @@ def _annotate_requant(self, n):

         # TODO: Store multiple pairs of requantize attributes when we have an op builder
         # that has multiple outputs that require quant attributes.
-        if q_attrs["dtype"] != dq_attrs["dtype"]:
-            dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
-            n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+        if self.skip_advanced_requant:
+            if q_attrs["dtype"] != dq_attrs["dtype"]:
+                dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+        else:
+            # When dtype is the same but other specs such as scale and offset are different,
+            # insert requant to improve accuracy.
+            # Users can turn this feature off if any inference speed drop is observed.
+            if any(
+                q_attrs[attr] != dq_attrs[attr]
+                for attr in [
+                    "scale",
+                    "zero_point",
+                    "quant_min",
+                    "quant_max",
+                    "dtype",
+                ]
+            ):
+                dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs

         # Dequant all the fold_quant parameters back to fp32.
         # If an operation is not supported by QNN and got fallback, it will expect a fp32 param.
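
To make the new behavior concrete, here is a standalone sketch of the comparison with invented attribute values: with advanced requant enabled, a scale or zero-point mismatch alone is enough to mark the node for requantization, whereas the old check only fired on a dtype change.

```python
q_attrs = {"scale": 0.02, "zero_point": 128, "quant_min": 0, "quant_max": 255, "dtype": "uint8"}
dq_attrs = {"scale": 0.05, "zero_point": 120, "quant_min": 0, "quant_max": 255, "dtype": "uint8"}

# Old check: dtype only -> no requant here
print(q_attrs["dtype"] != dq_attrs["dtype"])  # False

# New check: any mismatch across the full quant spec -> requant
print(any(
    q_attrs[attr] != dq_attrs[attr]
    for attr in ["scale", "zero_point", "quant_min", "quant_max", "dtype"]
))  # True, because scale and zero_point differ
```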
