pytorch
diff --git a/‎.ci/docker/ci_commit_pins/pytorch.txt
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/ci_commit_pins/pytorch.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/docker/common/install_clang.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/common/install_clang.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/docker/requirements-ci.txt
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/requirements-ci.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/scripts/setup-linux.sh
Lines changed: 3 additions & 2 deletions b/‎.ci/scripts/setup-linux.sh
Lines changed: 3 additions & 2 deletions
diff --git a/‎.ci/scripts/setup-qnn-deps.sh
Lines changed: 4 additions & 2 deletions b/‎.ci/scripts/setup-qnn-deps.sh
Lines changed: 4 additions & 2 deletions
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 5 additions & 2 deletions b/‎.ci/scripts/utils.sh
Lines changed: 5 additions & 2 deletions
diff --git a/‎.github/pull_request_template.md
Lines changed: 9 additions & 0 deletions b/‎.github/pull_request_template.md
Lines changed: 9 additions & 0 deletions
diff --git a/‎.github/workflows/_unittest.yml
Lines changed: 7 additions & 0 deletions b/‎.github/workflows/_unittest.yml
Lines changed: 7 additions & 0 deletions
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/trunk.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎CONTRIBUTING.md
Lines changed: 15 additions & 2 deletions b/‎CONTRIBUTING.md
Lines changed: 15 additions & 2 deletions
diff --git a/‎backends/apple/coreml/README.md
Lines changed: 2 additions & 2 deletions b/‎backends/apple/coreml/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/cadence/aot/functions.yaml
Lines changed: 4 additions & 0 deletions b/‎backends/cadence/aot/functions.yaml
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/cadence/aot/functions_hifi.yaml
Lines changed: 4 additions & 0 deletions b/‎backends/cadence/aot/functions_hifi.yaml
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/cadence/aot/ops_registrations.py
Lines changed: 21 additions & 0 deletions b/‎backends/cadence/aot/ops_registrations.py
Lines changed: 21 additions & 0 deletions
diff --git a/‎backends/cadence/hifi/operators/quantized_layer_norm.cpp
Lines changed: 41 additions & 3 deletions b/‎backends/cadence/hifi/operators/quantized_layer_norm.cpp
Lines changed: 41 additions & 3 deletions
diff --git a/‎backends/cadence/reference/operators/quantized_layer_norm.cpp
Lines changed: 51 additions & 7 deletions b/‎backends/cadence/reference/operators/quantized_layer_norm.cpp
Lines changed: 51 additions & 7 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnFunctionInterface.h
Lines changed: 1 addition & 0 deletions b/‎backends/qualcomm/runtime/backends/QnnFunctionInterface.h
Lines changed: 1 addition & 0 deletions
@@ -1 +1 @@
-bd5482c7c3e1197e10c46ff739027f917d9c1fcc
+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad
@@ -13,7 +13,7 @@ install_ubuntu() {
   apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
   apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
   # Also require LLD linker from llvm and libomp to build PyTorch from source
-  apt-get install -y lld "libomp-${CLANG_VERSION}-dev"
+  apt-get install -y lld "libomp-${CLANG_VERSION}-dev" "libc++-${CLANG_VERSION}-dev"
 
   # Use update-alternatives to make this version the default
   update-alternatives --install /usr/bin/clang clang /usr/bin/clang-"$CLANG_VERSION" 50
 
@@ -1,5 +1,5 @@
 mpmath==1.3.0
-numpy==1.22.0; python_version == '3.10'
+numpy==1.21.3; python_version == '3.10'
 numpy==1.23.2; python_version == '3.11'
 numpy; python_version >= '3.12'
 PyYAML==6.0.1
 
@@ -19,6 +19,7 @@ else
 fi
 
 # As Linux job is running inside a Docker container, all of its dependencies
-# have already been installed
-install_executorch
+# have already been installed, so we use a PyTorch build from source here instead
+# of the nightly. This allows CI to test against the latest commits from PyTorch
+install_executorch "use-pt-pinned-commit"
 build_executorch_runner "${BUILD_TOOL}"
@@ -31,8 +31,9 @@ install_qnn() {
 }
 
 setup_libc++() {
+  clang_version=$1
   sudo apt-get update
-  pkgs_to_check=('libc++-dev')
+  pkgs_to_check=("libc++-${clang_version}-dev")
   j=0
   while [ $j -lt ${#pkgs_to_check[*]} ]; do
     install_status=$(verify_pkg_installed ${pkgs_to_check[$j]})
@@ -47,5 +48,6 @@ setup_libc++() {
   done
 }
 
-setup_libc++
+# This needs to match the clang version from the Docker image
+setup_libc++ 12
 install_qnn
@@ -20,8 +20,11 @@ install_executorch() {
   which pip
   # Install executorch, this assumes that Executorch is checked out in the
   # current directory.
-  # TODO(T199538337): clean up install scripts to use install_requirements.sh
-  ./install_requirements.sh --pybind xnnpack
+  if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then
+    ./install_requirements.sh --pybind xnnpack --use-pt-pinned-commit
+  else
+    ./install_requirements.sh --pybind xnnpack
+  fi
   # Just print out the list of packages for debugging
   pip list
 }
 
@@ -0,0 +1,9 @@
+### Summary
+[PLEASE REMOVE] See [CONTRIBUTING.md's Pull Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests) for ExecuTorch PR guidelines.
+
+[PLEASE REMOVE] If this PR closes an issue, please add a `Fixes #<issue-id>` line.
+
+[PLEASE REMOVE] If this PR introduces a fix or feature that should be in the upcoming release notes, please add a "Release notes: <area>" label. For a list of available release notes labels, check out [CONTRIBUTING.md's Pull Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests).
+
+### Test plan
+[PLEASE REMOVE] How did you test this PR? Please write down any manual commands you used and note down tests that you have written if applicable.
@@ -37,6 +37,9 @@ jobs:
         CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
         .ci/scripts/setup-linux.sh cmake
 
+        # Install llama3_2_vision dependencies.
+        PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh
+
         # Run pytest with coverage
         pytest -n auto --cov=./ --cov-report=xml
         # Run gtest
@@ -67,6 +70,10 @@ jobs:
         ${CONDA_RUN} --no-capture-output \
         .ci/scripts/setup-macos.sh cmake
 
+        # Install llama3_2_vision dependencies.
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
+        ./examples/models/llama3_2_vision/install_requirements.sh
+
         # Run pytest with coverage
         ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
         # Run gtest
 
@@ -137,6 +137,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-arm-sdk
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -162,6 +163,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-arm-sdk
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
 
@@ -242,14 +242,27 @@ for basics.
    - Give the PR a clear and thorough description. Don't just describe what the PR
      does: the diff will do that. Explain *why* you are making this change, in a
      way that will make sense to someone years from now.
-   - Add the line `Test Plan:` (with that spelling, capitalization, and trailing
-     colon character), followed by lines containing repeatable instructions for
+   - Explain how you have tested your changes by including repeatable instructions for
      testing the PR.
      - If you added tests, this can be as simple as the command you used to run the
        tests.
      - If you tested the PR manually, include the steps and the outputs. Help a
        future editor understand how to test the code that you're modifying
        today.
+   - If your PR contains or is representative of a feature/bug fix that should be
+     called out in the release notes, please add a label for "Release notes: \<area\>",
+     where \<area\> describes which part of ExecuTorch the change pertains to, e.g.
+     "Release notes: runtime". Here are all of the categories:
+     - `Release notes: runtime`: changes related to the core runtime which loads the program methods, initializes delegates, and runs the lowered graph.
+     - `Release notes: exir`: changes to any internal representations, such as any edge-related dialects. Also any changes to passes that may modify the exir, such as memory planning.
+     - `Release notes: quantization`: changes to quantization.
+     - `Release notes: ops & kernels`: changes to the opset and any new / changed kernel implementations.
+     - `Release notes: api`: changes to public-facing APIs (any interfaces, pybind-exposed runtime methods, etc.).
+     - `Release notes: backends`: changes to any of the backend delegates.
+     - `Release notes: build`: changes related to the build system, including major dependency upgrades, notable build flags, optimizations, etc.
+     - `Release notes: devtools`: changes to any of ExecuTorch's developer tools, for example the debugger & profiler.
+     - `Release notes: examples`: changes to any code under `examples/`.
+     - `Release notes: misc`: anything notable that doesn't belong in the above categories.
    - See https://github.com/pytorch/executorch/pull/3612 for an example PR that
      follows this advice.
 1. Before asking for a review, ensure that all [CI (continuous integration)
 
@@ -65,7 +65,7 @@ To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQ
 import torch
 import executorch.exir
 
-from torch._export import capture_pre_autograd_graph
+from torch.export import export_for_training
 from torch.ao.quantization.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
@@ -93,7 +93,7 @@ class Model(torch.nn.Module):
 source_model = Model()
 example_inputs = (torch.randn((1, 3, 256, 256)), )
 
-pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs)
+pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
 
 quantization_config = LinearQuantizerConfig.from_dict(
     {
 
@@ -154,6 +154,10 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
 
@@ -125,6 +125,10 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
 
@@ -36,6 +36,12 @@
 lib.define(
     "quantized_layer_norm.out(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
 )
+lib.define(
+    "quantized_layer_norm.per_tensor(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)"
+)
+lib.define(
+    "quantized_layer_norm.per_tensor_out(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
 
 lib.define(
     "quantized_linear(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
@@ -180,6 +186,21 @@ def quantized_layer_norm_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_layer_norm.per_tensor")
+def quantized_layer_norm_per_tensor_meta(
+    input: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    normalized_shape: int,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    output_scale: float,
+    output_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_relu")
 def quantized_relu_meta(
     X: torch.Tensor,
 
@@ -27,7 +27,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     float input_scale,
     int64_t input_zero_point,
@@ -107,7 +107,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -120,7 +120,7 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
@@ -157,6 +157,44 @@ void quantized_layer_norm_out(
 #undef typed_quantized_layer_norm
 }
 
+// Per-tensor entry point for quantized layer norm: the input scale and zero
+// point are passed as scalars (in_scale, in_zero_point) instead of tensors.
+// Dispatches on the dtype of `input` over ET_FORALL_CADENCE_QUANTIZED_TYPES
+// and forwards all arguments to quantized_layer_norm_per_tensor_<ctype>.
+// `ctx` and `normalized_shape` are accepted but not used here.
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+#define typed_quantized_layer_norm(ctype, dtype) \
+  case ScalarType::dtype: {                      \
+    quantized_layer_norm_per_tensor_<ctype>(     \
+        input,                                   \
+        in_scale,                                \
+        in_zero_point,                           \
+        weight,                                  \
+        bias,                                    \
+        eps,                                     \
+        output_scale,                            \
+        output_zero_point,                       \
+        out);                                    \
+    break;                                       \
+  }
+
+  ScalarType dtype = input.scalar_type();
+  switch (dtype) {
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_layer_norm)
+    default:
+      // Use ET_CHECK_MSG (as the reference kernel does) rather than the
+      // debug-only ET_DCHECK_MSG, so an unsupported dtype is reported in
+      // every build instead of returning with `out` left unwritten.
+      ET_CHECK_MSG(
+          false, "Unhandled dtype %s", torch::executor::toString(dtype));
+  }
+
+#undef typed_quantized_layer_norm
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
 
@@ -11,9 +11,11 @@
 
 #include <cmath>
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace impl {
 namespace reference {
@@ -22,7 +24,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     double input_scale,
     int64_t input_zero_point,
@@ -98,7 +100,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -111,11 +113,11 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
-    const executorch::aten::IntArrayRef normalized_shape,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
     const Tensor& weight,
     const Tensor& bias,
     double eps,
@@ -152,6 +154,48 @@ void quantized_layer_norm_out(
   }
 }
 
+// Per-tensor entry point for the reference quantized layer norm: the input
+// scale and zero point arrive as scalars rather than tensors. Selects the
+// uint8_t or int8_t instantiation of quantized_layer_norm_per_tensor_ from the
+// dtype of `input`; any other dtype is reported through ET_CHECK_MSG.
+// `ctx` and `normalized_shape` are accepted but not used here.
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+  switch (input.scalar_type()) {
+    case executorch::aten::ScalarType::Byte:
+      // Unsigned 8-bit quantized input.
+      quantized_layer_norm_per_tensor_<uint8_t>(
+          input,
+          in_scale,
+          in_zero_point,
+          weight,
+          bias,
+          eps,
+          output_scale,
+          output_zero_point,
+          out);
+      break;
+    case executorch::aten::ScalarType::Char:
+      // Signed 8-bit quantized input.
+      quantized_layer_norm_per_tensor_<int8_t>(
+          input,
+          in_scale,
+          in_zero_point,
+          weight,
+          bias,
+          eps,
+          output_scale,
+          output_zero_point,
+          out);
+      break;
+    default:
+      ET_CHECK_MSG(
+          false,
+          "Unhandled input dtype %hhd",
+          static_cast<int8_t>(input.scalar_type()));
+  }
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
@@ -70,6 +70,7 @@ class QnnInterface {
   DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel);
   // --------- QnnProfile ---------
   DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate);
+  DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig);
   DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents);
   DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents);
   DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData);
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-bd5482c7c3e1197e10c46ff739027f917d9c1fcc`
	`1`	`+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad`