
Commit 68462e9

Merge remote-tracking branch 'origin/main' into android-ciflow
2 parents: 690c773 + d519b4d

173 files changed: +4500, -608 lines

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e4cd76cf8283c8ddbf95674b020fbfcff467cb4b
+00e3eea170ce5db8ea9c62ce5e48f13886cd6d20

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 # LICENSE file in the root directory of this source tree.
 
 set -eux
+set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."

.ci/scripts/test.sh

Lines changed: 20 additions & 2 deletions
@@ -156,9 +156,27 @@ test_model_with_qnn() {
   export PYTHONPATH=$EXECUTORCH_ROOT/..
 
   if [[ "${MODEL_NAME}" == "dl3" ]]; then
-    "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.deeplab_v3 -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only --download
-    EXPORTED_MODEL=./deeplab_v3/dlv3_qnn.pte
+    EXPORT_SCRIPT=deeplab_v3
+    EXPORTED_MODEL_NAME=dlv3_qnn.pte
+  elif [[ "${MODEL_NAME}" == "mv3" ]]; then
+    EXPORT_SCRIPT=mobilenet_v3
+    EXPORTED_MODEL_NAME=mv3_qnn.pte
+  elif [[ "${MODEL_NAME}" == "mv2" ]]; then
+    EXPORT_SCRIPT=mobilenet_v2
+    EXPORTED_MODEL_NAME=mv2_qnn.pte
+  elif [[ "${MODEL_NAME}" == "ic4" ]]; then
+    EXPORT_SCRIPT=inception_v4
+    EXPORTED_MODEL_NAME=ic4_qnn.pte
+  elif [[ "${MODEL_NAME}" == "ic3" ]]; then
+    EXPORT_SCRIPT=inception_v3
+    EXPORTED_MODEL_NAME=ic3_qnn.pte
+  elif [[ "${MODEL_NAME}" == "vit" ]]; then
+    EXPORT_SCRIPT=torchvision_vit
+    EXPORTED_MODEL_NAME=vit_qnn.pte
   fi
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only
+  EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME}
 }
 
 if [[ "${BACKEND}" == "portable" ]]; then

.github/pytorch-probot.yml

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml
 ciflow_push_tags:
+- ciflow/android
+- ciflow/apple
 - ciflow/nightly
 - ciflow/trunk
 - ciflow/binaries

.github/workflows/android-perf.yml

Lines changed: 10 additions & 9 deletions
@@ -84,9 +84,9 @@ jobs:
       # Separate default values from the workflow dispatch. To ensure defaults are accessible
       # during scheduled runs and to provide flexibility for different defaults between
       # on-demand and periodic benchmarking.
-      CRON_DEFAULT_MODELS: "stories110M"
+      CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit"
       CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x"
-      CRON_DEFAULT_DELEGATES: "xnnpack"
+      CRON_DEFAULT_DELEGATES: "xnnpack,qnn"
     run: |
       set -ex
       MODELS="${{ inputs.models }}"

@@ -162,6 +162,11 @@ jobs:
         # Test llama2
         if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
           DELEGATE_CONFIG="xnnpack+custom+qe"
+        elif [[ ${{ matrix.delegate }} == "qnn" ]]; then
+          DELEGATE_CONFIG="qnn"
+        else
+          echo "Unsupported delegate ${{ matrix.delegate }}"
+          exit 1
         fi
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
       else

@@ -201,9 +206,6 @@ jobs:
     name: build-llm-demo
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
-    strategy:
-      matrix:
-        tokenizer: [bpe]
     with:
       runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-clang12-android

@@ -222,7 +224,7 @@ jobs:
 
       # TODO: This needs to be replaced with a generic loader .apk
       # Build LLM Demo for Android
-      bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
+      bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
 
   # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
   upload-android-apps:

@@ -278,9 +280,8 @@ jobs:
       # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
       # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
       # one app+flavor that could load and run the model.
-      # TODO: Hard code llm_demo_bpe for now in this job.
-      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk
-      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk
+      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk
+      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk
       # NB: Need to set the default spec here so that it works for periodic too
       test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }}
       # Uploaded to S3 from the previous job

.github/workflows/android.yml

Lines changed: 3 additions & 13 deletions
@@ -26,9 +26,6 @@ jobs:
   build-llm-demo:
     name: build-llm-demo
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    strategy:
-      matrix:
-        tokenizer: [bpe, tiktoken]
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-clang12-android

@@ -46,7 +43,7 @@ jobs:
       export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
       # Build LLM Demo for Android
-      bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
+      bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
 
   # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
   upload-artifacts:

@@ -157,13 +154,6 @@ jobs:
       id-token: write
       contents: read
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
-    strategy:
-      matrix:
-        # https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/README.md#alternative-2-build-from-local-machine
-        # mentions that tiktoken is only for Llama3. So, we can export it later in another archive
-        # like https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip when this is
-        # updated to run Llama3
-        tokenizer: [bpe]
     with:
       device-type: android
       runner: linux.2xlarge

@@ -173,8 +163,8 @@ jobs:
       # This is the custom Android device pool that only includes Samsung Galaxy S2x
       device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa
       # Uploaded to S3 from the previous job, the name of the app comes from the project itself
-      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk
-      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk
+      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk
+      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk
       test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml
       # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30
       # days and the job will automatically re-upload the file when that happens.

.github/workflows/apple.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ on:
       - build/build_apple_frameworks.sh
       - build/create_frameworks.sh
       - build/test_ios_ci.sh
-      - examples/demo-apps/**
+      - examples/demo-apps/apple/**
       - extension/apple/**
       - extension/module/**
   workflow_dispatch:

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
@@ -305,7 +305,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [dl3]
+        model: [dl3, mv3, mv2, ic4, ic3, vit]
       fail-fast: false
     with:
       runner: linux.2xlarge

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -637,6 +637,13 @@ if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()
 
+if(EXECUTORCH_BUILD_PTHREADPOOL
+   AND EXECUTORCH_BUILD_CPUINFO
+   AND CMAKE_CXX_STANDARD GREATER_EQUAL 14
+)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
+endif()
+
 if(EXECUTORCH_BUILD_PYBIND)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)

backends/arm/arm_partitioner.py

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.convolution.default,
             exir_ops.edge.aten.div.Tensor,
+            exir_ops.edge.aten.exp.default,
+            exir_ops.edge.aten.log.default,
             exir_ops.edge.aten.split_with_sizes_copy.default,
             exir_ops.edge.aten.full.default,
             exir_ops.edge.aten.mul.Tensor,
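Note: with exp and log in the supported-ops list, graphs containing these edge ops can now be delegated to the Arm/TOSA backend instead of falling back to portable kernels. A minimal sketch of a module that exercises both ops follows; torch.exp and torch.log lower to exir_ops.edge.aten.exp.default and exir_ops.edge.aten.log.default in the edge dialect, but the export/partition calls are omitted since that API is not part of this diff:

import torch

# Toy module using both newly supported ops (a softplus-like activation).
class ExpLogModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.log(1.0 + torch.exp(x))

print(ExpLogModule()(torch.randn(2, 3)))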

backends/arm/operators/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -14,9 +14,11 @@
     op_conv2d,
     op_dequant,
     op_div,
+    op_exp,
     op_full,
     op_get_item,
     op_hardtanh,
+    op_log,
     op_mean_dim,
     op_mm,
     op_mul,

backends/arm/operators/op_conv2d.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def adjust_pad_if_needed(self, input, weight, stride, pad, dilation):
 
         if mod_remainder > pad:
             raise RuntimeError(
-                f"ignoring input element is not currently supported, got a large stride {stride}"
+                "This case should be handled by the SizeAdjustConv2d pass, is it enabled?"
             )
         return pad - mod_remainder
 

backends/arm/operators/op_exp.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import numpy as np
+
+import serializer.tosa_serializer as ts
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+
+from executorch.backends.arm.tosa_quant_utils import (
+    dequantize_value,
+    get_quant_node_args,
+    QuantArgs,
+    quantize_value,
+)
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class ExpVisitor(NodeVisitor):
+    target = "aten.exp.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+
+        assert len(node.all_input_nodes) == 1
+        assert len(node.users) == 1
+
+        if is_quant_node:
+            # Assume quantized input is 8 bit.
+
+            # Create attribute for 8 bit table lookup.
+            input_node = node.all_input_nodes[0]
+            in_quantargs = get_quant_node_args(input_node)
+            output_node = list(node.users)[0]
+            out_quantargs = get_quant_node_args(output_node)
+
+            table = exp_table_8bit(in_quantargs, out_quantargs)
+            table_attr = ts.TosaSerializerAttribute()
+            table_attr.TableAttribute(table)
+
+            tosa_graph.addOperator(
+                TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr
+            )
+        else:
+            tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name])
+
+
+def exp_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs):
+    """
+    Returns a table mapping 256 entries to exp([qmin,qmax])
+    """
+
+    def exp(x):
+        # Convert quantized input to floating point exp input space.
+        v = dequantize_value(x, in_quantargs)
+        # Compute exp.
+        v = np.exp(v)
+        # Convert exp output back to quantized space.
+        return quantize_value(v, out_quantargs)
+
+    return [
+        exp(x)
+        for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8)
+    ]
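Note: on the quantized path, exp is not computed at runtime at all; the visitor emits a TOSA TABLE op whose 256 int8 entries bake dequantize, exp, and requantize into a single lookup. Below is a self-contained sketch of that table construction, assuming affine quantization (x_fp = (x_q - zero_point) * scale); the scale and zero-point values are illustrative stand-ins for what get_quant_node_args would return, not values from this diff:

import numpy as np

# Hypothetical quantization parameters for the input and output tensors.
IN_SCALE, IN_ZP = 0.05, 0
OUT_SCALE, OUT_ZP = 1.2, -128
QMIN, QMAX = -128, 127

def exp_entry(x_q: int) -> int:
    v = (x_q - IN_ZP) * IN_SCALE        # dequantize to float
    v = float(np.exp(v))                # the actual op
    q = round(v / OUT_SCALE) + OUT_ZP   # requantize
    return int(np.clip(q, QMIN, QMAX))  # clamp to the int8 range

# 256 entries covering the whole quantized input range, as in exp_table_8bit.
table = [exp_entry(int(x)) for x in np.linspace(QMIN, QMAX, 256, dtype=np.int8)]
print(table[0], table[-1])  # smallest and largest quantized exp outputs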

backends/arm/operators/op_log.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import numpy as np
+
+import serializer.tosa_serializer as ts
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+
+from executorch.backends.arm.tosa_quant_utils import (
+    dequantize_value,
+    get_quant_node_args,
+    QuantArgs,
+    quantize_value,
+)
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class LogVisitor(NodeVisitor):
+    target = "aten.log.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+
+        assert len(node.all_input_nodes) == 1
+        assert len(node.users) == 1
+
+        if is_quant_node:
+            # Assume quantized input is 8 bit.
+
+            # Create attribute for 8 bit table lookup.
+            input_node = node.all_input_nodes[0]
+            in_quantargs = get_quant_node_args(input_node)
+            output_node = list(node.users)[0]
+            out_quantargs = get_quant_node_args(output_node)
+
+            table = log_table_8bit(in_quantargs, out_quantargs)
+            table_attr = ts.TosaSerializerAttribute()
+            table_attr.TableAttribute(table)
+
+            tosa_graph.addOperator(
+                TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr
+            )
+        else:
+            tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name])
+
+
+def log_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs):
+    """
+    Returns a table mapping 256 entries to log([qmin,qmax])
+    """
+
+    def log(x):
+        # Convert quantized input to floating point log input space.
+        v = dequantize_value(x, in_quantargs)
+        # Compute log.
+        v = np.log(v)
+        # Convert log output back to quantized space.
+        return quantize_value(v, out_quantargs)
+
+    return [
+        log(x)
+        for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8)
+    ]

backends/arm/passes/annotate_channels_last_dim_order_pass.py

Lines changed: 3 additions & 1 deletion
@@ -46,7 +46,9 @@ def call(self, graph_module: torch.fx.GraphModule):
         NHWC_Order = (0, 2, 3, 1)
         HWCM_Order = (2, 3, 0, 1)
         for node in graph_module.graph.nodes:
-            if isinstance(node.meta["val"], tuple):
+            if isinstance(
+                node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list)
+            ):
                node_data = node.meta["val"][0].data
             else:
                 node_data = node.meta["val"].data
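Note: node.meta["val"] holds a node's (fake) output value(s); for multi-output ops it is a sequence, and FX often stores that sequence as torch.fx.immutable_collections.immutable_list rather than a plain tuple, which is the case this fix adds. A small sketch that surfaces such a node (immutable_list subclasses list, so the isinstance check below catches it; this assumes make_fx populates "val" metadata, as current PyTorch does):

import torch
from torch.fx.experimental.proxy_tensor import make_fx

def f(x):
    # split_with_sizes has multiple outputs, so its node's meta["val"]
    # is a sequence with one tensor per output.
    a, b = torch.split(x, [2, 2], dim=0)
    return a + b

gm = make_fx(f)(torch.randn(4, 3))
for node in gm.graph.nodes:
    val = node.meta.get("val")
    if isinstance(val, (tuple, list)):
        print(node.name, type(val).__name__, [v.shape for v in val])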
