Commit 07d1c26

Merge branch 'pytorch:main' into add-profiling-to-xnn-executor-runner-2
2 parents 34b6b3e + fedb035

File tree: 74 files changed, +1039 −2244 lines


.ci/scripts/utils.sh

Lines changed: 3 additions & 3 deletions
@@ -17,17 +17,17 @@ retry () {
}

clean_executorch_install_folders() {
-  ./install_requirements.sh --clean
+  ./install_executorch.sh --clean
}

install_executorch() {
  which pip
  # Install executorch, this assumes that Executorch is checked out in the
  # current directory.
  if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then
-    ./install_requirements.sh --pybind xnnpack --use-pt-pinned-commit
+    ./install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
  else
-    ./install_requirements.sh --pybind xnnpack
+    ./install_executorch.sh --pybind xnnpack
  fi
  # Just print out the list of packages for debugging
  pip list

.github/workflows/apple.yml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ on:
    paths:
      - .ci/scripts/setup-ios.sh
      - .github/workflows/apple.yml
-      - install_requirements.sh
+      - install_executorch.sh
      - backends/apple/**
      - build/build_apple_frameworks.sh
      - build/build_apple_llm_demo.sh

.github/workflows/pull.yml

Lines changed: 8 additions & 5 deletions
@@ -200,7 +200,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install Llava requirements
        bash examples/models/llama/install_requirements.sh
@@ -333,6 +333,9 @@ jobs:

  unittest-arm:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -433,7 +436,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install phi-3-mini requirements
        bash examples/models/phi-3-mini/install_requirements.sh
@@ -460,7 +463,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh
@@ -487,7 +490,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh
@@ -514,7 +517,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

.github/workflows/trunk.yml

Lines changed: 6 additions & 0 deletions
@@ -132,6 +132,9 @@ jobs:
  test-arm-backend-delegation:
    name: test-arm-backend-delegation
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -159,6+162,9 @@ jobs:
  test-arm-reference-delegation:
    name: test-arm-reference-delegation
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk

backends/apple/mps/setup.md

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf
### [Optional] Run the generated model directly using pybind
1. Make sure `pybind` MPS support was installed:
```bash
-./install_requirements.sh --pybind mps
+./install_executorch.sh --pybind mps
```
2. Run the `mps_example` script to trace the model and run it directly from python:
```bash

backends/cadence/build_cadence_fusionG3.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=FCV_FG3GP
git submodule sync
git submodule update --init
-./install_requirements.sh
+./install_executorch.sh

rm -rf cmake-out

backends/cadence/build_cadence_hifi4.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=nxp_rt600_RI23_11_newlib
git submodule sync
git submodule update --init
-./install_requirements.sh
+./install_executorch.sh

rm -rf cmake-out

backends/cadence/fusion_g3/operators/op_dequantize.cpp

Lines changed: 6 additions & 5 deletions
@@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args(

  ET_CHECK_MSG(
      input.scalar_type() == dtype,
-      "input.scalar_type() %" PRId8 " is not matching dtype argumenta:",
-      static_cast<int8_t>(input.scalar_type()));
+      "input.scalar_type() %s is not matching dtype arguments:",
+      ::executorch::runtime::toString(input.scalar_type()));

  if (out_dtype.has_value()) {
    ET_CHECK_MSG(
@@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out(
    const Tensor& input,
    double scale,
    int64_t zero_point,
-    int64_t quant_min,
-    int64_t quant_max,
+    __ET_UNUSED int64_t quant_min,
+    __ET_UNUSED int64_t quant_max,
    ScalarType dtype,
-    ::executorch::aten::optional<ScalarType> out_dtype,
    Tensor& out) {
+  constexpr ScalarType out_dtype = ScalarType::Float;
+
#ifdef OP_ARG_CHECK
  torch::executor::Error err = resize_tensor(out, input.sizes());
  ET_CHECK_MSG(

backends/cadence/reference/operators/quantized_conv_out.cpp

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
                if (((_h + d0 * _wh - p0) >= 0) &&
                    ((_h + d0 * _wh - p0) < h) &&
                    ((_w + d1 * _ww - p1) >= 0) &&
-                    ((_w + d1 * _ww - p1 < w))) {
+                    ((_w + d1 * _ww - p1) < w)) {
                  int ioff =
                      (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1);
                  int woff = _wh * ww + _ww;

backends/qualcomm/_passes/insert_requantize.py

Lines changed: 0 additions & 6 deletions
@@ -89,15 +89,9 @@ def _single_output_annotation(
        requantize_dict = n.meta.pop(QCOM_REQUANTIZE)
        # {quant_attr: user_node_name_list}
        group_quant_attr_dict = self._invert_dict(requantize_dict)
-        # TODO: If users of the node contain output node,
-        # we replace the node with to_copy op. However, it would
-        # be problem when the node has multiple to_copy ops
-        add_output = len(group_quant_attr_dict) == 1

        for hashable_quant_attr, user_nodes in group_quant_attr_dict.items():
            user_nodes_copy = user_nodes.copy()
-            if add_output:
-                user_nodes_copy.append("output")
            self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy)

    def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 68 additions & 6 deletions
@@ -14,17 +14,80 @@
    QuantizationConfig,
)
from executorch.exir.dialects._ops import ops as exir_ops
-from torch.ao.quantization.observer import MinMaxObserver
+from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver
from torch.ao.quantization.quantizer import (
    QuantizationAnnotation,
+    QuantizationSpec,
    SharedQuantizationSpec,
)
from torch.fx import Node


-def annotate_matmul_16a8w(  # noqa: C901
-    gm: torch.fx.GraphModule, traverse_input1=True
-) -> None:
+def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None:
+    def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_spec = quantization_config.input_activation
+        input_qspec_map[input_act] = input_spec
+
+        weight = node.args[1]
+        input_qspec_map[weight] = quantization_config.weight
+
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config(
+        torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+    )
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default:
+            if "nn_module_stack" in node.meta:
+                module_values_list = list(node.meta["nn_module_stack"].values())
+                full_qualified_name = module_values_list[-1][0]
+                if full_qualified_name == "output.conv":
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_16a8w_per_channel
+                    )
+
+
+def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
+    for node in gm.graph.nodes:
+        if node.op == "output":
+            for index, prefill_output in enumerate(node.args[0]):
+                kv_quant_attr = kv_quant_attrs[index]
+                fixed_observer = FixedQParamsObserver.with_args(
+                    scale=kv_quant_attr[0],
+                    zero_point=kv_quant_attr[1],
+                    quant_min=kv_quant_attr[2],
+                    quant_max=kv_quant_attr[3],
+                    dtype=kv_quant_attr[4],
+                    qscheme=torch.torch.per_tensor_affine,
+                )
+
+                fixed_output_spec = QuantizationSpec(
+                    quant_min=kv_quant_attr[2],
+                    quant_max=kv_quant_attr[3],
+                    dtype=kv_quant_attr[4],
+                    ch_axis=0,
+                    observer_or_fake_quant_ctr=fixed_observer,
+                )
+
+                input_qspec_map = {}
+                for input in prefill_output.args:
+                    if isinstance(input, Node):
+                        input_qspec_map[input] = fixed_output_spec
+
+                prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+                    input_qspec_map=input_qspec_map,
+                    output_qspec=fixed_output_spec,
+                    _annotated=True,
+                )
+
+
+def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
    """
    This function is specific for matmul op 16a8w.
    For k, we will tag such as the below, and
@@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node):
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
            annotate_matmul(node, quantization_config_16a8w)
-            if traverse_input1:
-                annotate_matmul_input1(node.args[1])
+            annotate_matmul_input1(node.args[1])


def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
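For context on how annotators like the ones added above are typically wired in, here is a hedged usage sketch. It assumes the `QnnQuantizer` class and its `add_custom_quant_annotations` hook from the ExecuTorch Qualcomm backend behave as in other examples in the repository; the import paths, the method name, and the `kv_quant_attrs` placeholder are assumptions for illustration, not something this commit introduces.

```python
# Hedged usage sketch (assumed API, not part of this commit): register the
# custom annotators with the Qualcomm quantizer before quantization.
from functools import partial

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_linear_16a8w_in_affine_layer,
    annotate_matmul_16a8w,
    annotate_prefill_kv_output,
)
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

quantizer = QnnQuantizer()

# Each entry is a callable that receives the torch.fx.GraphModule; extra
# arguments (e.g. kv_quant_attrs for annotate_prefill_kv_output) can be
# bound ahead of time with functools.partial.
kv_quant_attrs = {}  # hypothetical placeholder: index -> (scale, zp, qmin, qmax, dtype)
quantizer.add_custom_quant_annotations(
    (
        annotate_matmul_16a8w,
        annotate_linear_16a8w_in_affine_layer,
        partial(annotate_prefill_kv_output, kv_quant_attrs=kv_quant_attrs),
    )
)
```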

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 3 additions & 1 deletion
@@ -3280,7 +3280,7 @@ def test_stories_single_llama(self):

        cmds = [
            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
            "--artifact",
            self.artifact_dir,
            "--build_folder",
@@ -3307,6 +3307,8 @@ def test_stories_single_llama(self):
            "16a4w",
            "--temperature",
            "0",
+            "--llama_model",
+            "stories110m",
        ]
        if self.host:
            cmds.extend(["--host", self.host])

backends/vulkan/docs/android_demo.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ First, build and install ExecuTorch libraries, then build the LLaMA runner
binary using the Android NDK toolchain.

```shell
-./install_requirements.sh --clean
+./install_executorch.sh --clean
(mkdir cmake-android-out && \
  cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+// 64 is the number of threads in the local wg
+$num_shared = 64 * TILE_SIZE * TILE_SIZE
+shared ivec2 pos_shared[${num_shared}];

/*
 * Computes a 2D pointwise convolution of an NxN output tile. Calculating an

backends/vulkan/test/op_tests/linear_weight_int4_test.cpp

Lines changed: 32 additions & 5 deletions
@@ -30,16 +30,38 @@ at::Tensor linear_weight_int4_reference_impl(
  const size_t ndim = original_x_size.size();
  const int64_t out_features = weights_4x2.size(0);
  const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]});
-  const at::Tensor packed_weights =
-      at::_convert_weight_to_int4pack(weights_4x2, inner_k_tiles);
-  at::Tensor out = at::_weight_int4pack_mm(
-      x_flattened, packed_weights, groupsize, scales_and_zeros);
+  at::Tensor out = at::_weight_int4pack_mm_for_cpu(
+      x_flattened, weights_4x2, groupsize, scales_and_zeros);
  std::vector<int64_t> out_shape(
      original_x_size.begin(), original_x_size.end());
  out_shape.at(ndim - 1) = out_features;
  return out.reshape(out_shape);
}

+at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) {
+  std::vector<int64_t> weights_shape(weights_4x2.sizes().vec());
+  weights_shape[1] *= 2;
+
+  at::Tensor weights_unpacked =
+      at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt));
+
+  const int64_t N = weights_unpacked.size(0);
+  const int64_t K = weights_unpacked.size(1);
+
+  for (int n = 0; n < N; n++) {
+    for (int k = 0; k < K; k += 2) {
+      const uint8_t packed_val = weights_4x2[n][k / 2].item().to<uint8_t>();
+      const uint8_t second_val = packed_val & 0x0F;
+      const uint8_t first_val = (packed_val & 0xF0) >> 4;
+
+      weights_unpacked[n][k] = int(first_val);
+      weights_unpacked[n][k + 1] = int(second_val);
+    }
+  }
+
+  return weights_unpacked;
+}
+
at::Tensor dequantize_and_linear(
    const at::Tensor& x,
    const at::Tensor& weights_4x2,
@@ -91,13 +113,18 @@ void test_reference_linear_int4(
  at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat));
  at::Tensor weights_4x2 =
      at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte));
+  at::Tensor weights_int = unpack_weights_4x2(weights_4x2);

  const int k_groups = K / group_size;
  at::Tensor scales_and_zeros =
      at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat));

  at::Tensor out = linear_weight_int4_reference_impl(
-      x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
+      x,
+      at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size),
+      group_size,
+      scales_and_zeros,
+      inner_k_tiles);

  at::Tensor out_ref = dequantize_and_linear(
      x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
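As a side note on the packing convention the new `unpack_weights_4x2` helper above relies on: each byte stores two 4-bit weights, with the first of the pair in the high nibble and the second in the low nibble. A minimal Python sketch of that convention (illustrative names only, not part of the test) is:

```python
# Illustrative sketch of the 4x2 nibble packing assumed by unpack_weights_4x2:
# one byte holds two 4-bit values, first value in the high nibble.

def pack_pair(first_val: int, second_val: int) -> int:
    """Pack two 4-bit values (0..15) into one byte, first value in the high nibble."""
    assert 0 <= first_val < 16 and 0 <= second_val < 16
    return (first_val << 4) | second_val

def unpack_pair(packed_val: int) -> tuple[int, int]:
    """Invert pack_pair: high nibble comes out first, low nibble second."""
    return (packed_val & 0xF0) >> 4, packed_val & 0x0F

if __name__ == "__main__":
    # Round-trip every possible pair to confirm the convention.
    for a in range(16):
        for b in range(16):
            assert unpack_pair(pack_pair(a, b)) == (a, b)
    print("nibble packing round-trip ok")
```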

backends/xnnpack/README.md

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam
cd executorch

# Get a clean cmake-out directory
-./install_requirements.sh --clean
+./install_executorch.sh --clean
mkdir cmake-out

# Configure cmake

build/test_ios.sh

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ say "Installing Requirements"

pip install --upgrade cmake pip setuptools wheel zstd

-./install_requirements.sh --pybind coreml mps xnnpack
+./install_executorch.sh --pybind coreml mps xnnpack
export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH"
./build/install_flatc.sh
