Commit 07621c9

rebase to pacify github
[ghstack-poisoned]
2 parents: 089e5ce + 0e35c30

107 files changed (+2256, -775 lines)

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-01f1cc44cbbfdf6307aa01b803a4ee22f9ade946
+5616fa4a68718ead203314a3467f7dd9547153ae

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ cmake --build cmake-out -j16 --target install --config Release

 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
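
Note: the deleted `-DCMAKE_PREFIX_PATH` flag shelled out to `distutils.sysconfig.get_python_lib`, and `distutils` was removed from the standard library in Python 3.12, so that lookup fails on current interpreters. If the site-packages prefix were still needed, a rough modern equivalent (a sketch, not part of this commit) would be:

    import sysconfig

    # sysconfig replaces distutils.sysconfig; "purelib" is the site-packages dir
    print(sysconfig.get_paths()["purelib"])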

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-              -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
               -DCMAKE_BUILD_TYPE=Release \
               -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
               -Bcmake-out .

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
@@ -371,7 +371,7 @@ jobs:
           size=${arr[4]}
           # threshold=48120 on devserver with gcc11.4
           # todo(lfq): update once binary size is below 50kb.
-          threshold="51408"
+          threshold="55504"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
           else
@@ -406,7 +406,7 @@ jobs:
           output=$(ls -la cmake-out/test/size_test)
           arr=($output)
           size=${arr[4]}
-          threshold="47560"
+          threshold="51656"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
           else
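
Context: `arr=($output)` word-splits the `ls -la` line, so `${arr[4]}` is the file size in bytes. Both thresholds grow by exactly 4096 bytes (one 4 KiB page), and the todo above still applies since the new ceilings remain above 50 kB.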

backends/arm/quantizer/quantization_annotator.py

Lines changed: 35 additions & 10 deletions
@@ -95,7 +95,10 @@ def _is_ok_for_quantization(
            continue

        for n_arg in _as_list(node.args[quant_property.index]):
-            assert isinstance(n_arg, Node)
+            if not isinstance(n_arg, Node):
+                raise TypeError(
+                    f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
+                )
            if not is_ok_for_quantization(n_arg, gm):  # type: ignore[attr-defined]
                logger.debug(
                    f'could not quantize node due to input "{node}": '
@@ -108,7 +111,10 @@ def _is_ok_for_quantization(


 def _annotate_input(node: Node, quant_property: _QuantProperty):
-    assert not is_annotated(node)
+    if is_annotated(node):
+        raise RuntimeError(
+            f"Cannot annotate input: node '{node.name}' is already annotated"
+        )
    if quant_property.optional and (
        quant_property.index >= len(node.args)
        or node.args[quant_property.index] is None
@@ -120,17 +126,28 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
        _as_list(quant_property.qspec),
        strict=True,
    ):
-        assert isinstance(n_arg, Node)
+        if not isinstance(n_arg, Node):
+            raise TypeError(
+                f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
+            )
        annotate_input_qspec_map(node, n_arg, qspec)
        if quant_property.mark_annotated:
            mark_node_as_annotated(n_arg)  # type: ignore[attr-defined]


 def _annotate_output(node: Node, quant_property: _QuantProperty):
-    assert not is_annotated(node)
-    assert not quant_property.mark_annotated
-    assert not quant_property.optional
-    assert quant_property.index == 0, "Only one output annotation supported currently"
+    if is_annotated(node):
+        raise RuntimeError(
+            f"Cannot annotate output: node '{node.name}' is already annotated"
+        )
+    if quant_property.mark_annotated:
+        raise ValueError(
+            "quant_property.mark_annotated must be False for output annotation"
+        )
+    if quant_property.optional:
+        raise ValueError("quant_property.optional must be False for output annotation")
+    if quant_property.index != 0:
+        raise ValueError("Only one output annotation supported currently")

    annotate_output_qspec(node, quant_property.qspec)

@@ -145,7 +162,9 @@ def _match_pattern(

    Each 'pattern' element is composed of a list of disjunctive nodes types.
    """
-    assert len(pattern) > 0, "No pattern provided"
+    if len(pattern) < 1:
+        raise ValueError("No pattern provided")
+
    if filter_fn is not None:
        if not filter_fn(node):
            return False
@@ -417,8 +436,14 @@ def any_or_hardtanh_min_zero(n: Node):
        torch.ops.aten.concatenate.default,
        torch.ops.aten.stack.default,
    ):
-        assert isinstance(node.args[0], list)
-        assert len(node.args[0]) != 0
+        # first argument should be a non-empty list of nodes
+        if not isinstance(node.args[0], list):
+            raise TypeError(
+                "Expected node.args[0] to be a list, got "
+                f"{type(node.args[0]).__name__!r}"
+            )
+        if len(node.args[0]) == 0:
+            raise ValueError("Expected non-empty list for node.args[0]")

        shared_qspec = SharedQuantizationSpec((node.args[0][0], node))
        quant_properties.quant_inputs = [
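
The pattern throughout this file swaps `assert` for explicit exceptions. The practical difference: assert statements are stripped when Python runs with `-O`, so the checks would silently vanish in optimized builds, and a bare `AssertionError` carries no message. A minimal sketch of the before/after shape (the `Node` import mirrors the file above):

    from torch.fx import Node

    def check_arg(n_arg):
        # Before: removed entirely under `python -O`
        assert isinstance(n_arg, Node)

    def check_arg_explicit(n_arg):
        # After: always enforced, with a typed, descriptive error
        if not isinstance(n_arg, Node):
            raise TypeError(
                f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
            )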

backends/arm/scripts/build_executorch.sh

Lines changed: 1 addition & 40 deletions
@@ -54,47 +54,9 @@ source ${setup_path_script}

 et_build_dir="${et_build_root}/cmake-out"

-# Used for flatcc host excutable if Devtools is used
-et_build_host_dir=${et_build_root}/cmake-out-host-tools
-
 set -x
 cd "${et_root_dir}"

-if [ "$build_with_etdump" = true ] ; then
-    ( set +x ;
-      echo "--------------------------------------------------------------------------------" ;
-      echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir}/bin/flatcc" ;
-      echo "--------------------------------------------------------------------------------" )
-
-    # Build host flatcc bin
-    # This is a way to work around that the flatcc executable get build for target (e.g. Arm) later
-    # and get replaced. flatcc is a tool used on the host for etdump and BundleIO handling.
-    # The way to solve this is to generate it once for the host, then copy it to ${et_build_host_dir}/bin
-    # and later point that out with -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc later.
-
-    cmake \
-        -DCMAKE_INSTALL_PREFIX=${et_build_host_dir} \
-        -DCMAKE_BUILD_TYPE=${build_type} \
-        -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
-        -DEXECUTORCH_ENABLE_LOGGING=ON \
-        -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-        -DEXECUTORCH_BUILD_DEVTOOLS=ON \
-        -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-        -DFLATCC_ALLOW_WERROR=OFF \
-        -B"${et_build_host_dir}" \
-        "${et_root_dir}"
-
-    # third-party/flatcc/bin/flatcc gets build already in the in the cmake config step above
-    # so there is no cmake building step done
-
-    # Copy host flatcc excutable so it's saved when we build for target (Arm) later
-    et_build_host_dir=$(realpath ${et_build_host_dir})
-    mkdir -p ${et_build_host_dir}/bin
-    cp third-party/flatcc/bin/flatcc ${et_build_host_dir}/bin
-fi
-
 ( set +x ;
   echo "--------------------------------------------------------------------------------" ;
   echo "Build ExecuTorch target libs ${build_type} into '${et_build_dir}'" ;
@@ -111,8 +73,7 @@ if [ "$build_with_etdump" = true ] ; then
    build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \
                             -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
                             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \
-                             -DFLATCC_ALLOW_WERROR=OFF \
-                             -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc "
+                             -DFLATCC_ALLOW_WERROR=OFF "
 fi

 echo "Building with Devtools: ${build_devtools_flags} ${build_with_etdump_flags}"

backends/arm/test/tester/analyze_output_utils.py

Lines changed: 16 additions & 8 deletions
@@ -154,6 +154,13 @@ def print_error_diffs(
        output_str += f"BATCH {n}\n"
        result_batch = result[n, :, :, :]
        reference_batch = reference[n, :, :, :]
+
+        if reference_batch.dtype == torch.bool or result_batch.dtype == torch.bool:
+            mismatches = (reference_batch != result_batch).sum().item()
+            total = reference_batch.numel()
+            output_str += f"(BOOLEAN tensor) {mismatches} / {total} elements differ ({mismatches / total:.2%})\n"
+            continue
+
        is_close = torch.allclose(result_batch, reference_batch, rtol, atol)
        if is_close:
            output_str += ".\n"
@@ -180,14 +187,15 @@ def print_error_diffs(
        output_str += _print_elements(
            result[n, :, :, :], reference[n, :, :, :], C, H, W, rtol, atol
        )
-
-    reference_range = torch.max(reference) - torch.min(reference)
-    diff = torch.abs(reference - result).flatten()
-    diff = diff[diff.nonzero()]
-    if not len(diff) == 0:
-        diff_percent = diff / reference_range
-        output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
-        output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"
+    # Only compute numeric error metrics if tensor is not boolean
+    if reference.dtype != torch.bool and result.dtype != torch.bool:
+        reference_range = torch.max(reference) - torch.min(reference)
+        diff = torch.abs(reference - result).flatten()
+        diff = diff[diff.nonzero()]
+        if not len(diff) == 0:
+            diff_percent = diff / reference_range
+            output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
+            output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"

    # Over-engineer separators to match output width
    lines = output_str.split("\n")
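
The new early-exit exists because the numeric path cannot handle booleans: `torch.allclose` and `torch.abs(reference - result)` both need arithmetic dtypes, and subtraction on `torch.bool` tensors raises a RuntimeError. The mismatch count is computable directly, as in this self-contained sketch of the same logic:

    import torch

    ref = torch.tensor([True, False, True, True])
    res = torch.tensor([True, True, True, False])

    # Element-wise inequality works on bool tensors; subtraction does not
    mismatches = (ref != res).sum().item()
    total = ref.numel()
    print(f"(BOOLEAN tensor) {mismatches} / {total} elements differ ({mismatches / total:.2%})")
    # (BOOLEAN tensor) 2 / 4 elements differ (50.00%)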

backends/mediatek/partitioner.py

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ def ops_to_not_decompose(
            torch.ops.aten.upsample_bilinear2d.vec,
            torch.ops.aten.upsample_nearest2d.default,
            torch.ops.aten.upsample_nearest2d.vec,
+            torch.ops.aten._safe_softmax.default,
        ]
        return (ops_not_decompose, None)
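
`aten._safe_softmax` is the softmax variant that recent PyTorch export inserts inside `scaled_dot_product_attention` (it returns zeros instead of NaN for fully-masked rows). Listing it here keeps export from decomposing it, presumably so the MediaTek backend can lower the attention pattern as a unit.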

backends/mediatek/scripts/mtk_build.sh

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out
 cmake -DBUCK2="$BUCK_PATH" \
      -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=android-26 \
      -DEXECUTORCH_BUILD_NEURON=ON \
      -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
      ..
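
`-DANDROID_PLATFORM=android-26` pins the NDK toolchain's minimum supported API level to 26 (Android 8.0); without it, the toolchain falls back to its own default, which varies across NDK releases.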

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass

 #################
 ## linear_qcnw ##
@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        )

        graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        dead_code_elimination_pass(graph_module)

+        # Re-trace the graph since new nodes were (potentially) inserted
+        graph_module = super().call(graph_module).graph_module
        return PassResult(graph_module, True)
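
Running dead-code elimination after the fusion presumably clears out the nodes orphaned by the rewrite before the graph is re-traced. For readers unfamiliar with the mechanism, plain `torch.fx` exposes the same idea directly (a sketch of the general technique, not of executorch's `dead_code_elimination_pass` itself):

    import torch
    import torch.fx

    def f(x: torch.Tensor) -> torch.Tensor:
        dead = x * 2  # no users after tracing -> removable
        return torch.relu(x)

    gm = torch.fx.symbolic_trace(f)
    gm.graph.eliminate_dead_code()  # drops side-effect-free nodes with no users
    gm.recompile()
    print(gm.code)  # the `x * 2` node is gone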

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import torch
 import torch.nn.functional as F

-from torchao.quantization.GPTQ import _check_linear_int4_k
+from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
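
The one-line change evidently tracks a torchao-side reorganization: `_check_linear_int4_k` now lives in a `GPTQ` module nested inside the `GPTQ` package, so the old flat import path no longer resolves.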

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.

 import logging
-from copy import deepcopy
 from typing import Any, Optional, Set

 import executorch.backends.vulkan.utils as utils
@@ -22,6 +21,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops

 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.tensor import TensorSpec

 logger: logging.Logger = logging.getLogger("")
 logger.setLevel(logging.INFO)
@@ -52,7 +52,7 @@ def insert_transition_node(
        (arg,),
    )
    clone_node.meta["val"] = arg.meta["val"]
-    clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+    clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
    clone_node.meta["spec"].const = False
    set_memory_metadata(clone_node, storage, layout)
    arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)
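
Instead of deep-copying the producer's spec, the transition node now derives a fresh `TensorSpec` from its own fake tensor (`meta["val"]`). That avoids carrying over stale fields from the source spec that do not apply to the clone, while `spec.const = False` is still set explicitly.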

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
        # Symbolic integer ops
        torch.ops.aten.sym_size.int,
+        operator.add,
+        operator.lt,
+        operator.gt,
+        operator.ge,
+        operator.le,
+        # Guard and assert ops
+        torch.ops.aten._assert_scalar.default,
+        torch.ops.aten.sym_constrain_range_for_size.default,
    ]
 )
 def register_ephemeral_op(features: OpFeatures):
@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
    return features


-@update_features(["llama::update_cache", "llama::custom_sdpa"])
+@update_features(
+    [
+        "llama::update_cache",
+        "llama::custom_sdpa",
+    ]
+)
 def register_sdpa_ops(features: OpFeatures):
    features.resize_fn = False
    features.buffer_impl = False
@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
    return features


-@update_features(exir_ops.edge.aten.view_copy.default)
-def register_view_op(features: OpFeatures):
+@update_features(
+    [
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.permute.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.select_copy.int,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.view_copy.default,
+    ]
+)
+def register_view_ops(features: OpFeatures):
    features.texture_impl = TextureImplFeatures(
        valid_packed_dims=all_packed_dims,
    )
@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
    # Indexing and lookup
    exir_ops.edge.aten.flip.default,
    exir_ops.edge.aten.index_select.default,
-    exir_ops.edge.aten.select_copy.int,
    # Tensor creation
    exir_ops.edge.aten.arange.start_step,
-    exir_ops.edge.aten.clone.default,
    exir_ops.edge.aten.constant_pad_nd.default,
    exir_ops.edge.aten.full.default,
    exir_ops.edge.aten.full_like.default,
@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry becasue they support all packed dimensions
 @update_features(
    [
-        # Indexing and lookup
-        exir_ops.edge.aten.slice_copy.Tensor,
        # Shape Manipulation
        exir_ops.edge.aten.squeeze_copy.dims,
        exir_ops.edge.aten.unsqueeze_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
        # Tensor combination
        exir_ops.edge.aten.cat.default,
        exir_ops.edge.aten.repeat.default,
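
The `operator.add`/`operator.lt`/etc. entries are the plain functions from Python's `operator` module; `torch.export` emits them as `call_function` targets when sizes are symbolic (for example, comparing or adding `SymInt`s), and the two guard ops back dynamic-shape assertions. Registering them as ephemeral lets such graphs pass the partitioner without requiring a Vulkan kernel.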

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
@@ -146,10 +146,11 @@ def op_node_is_compatible(  # noqa: C901: Function is too complex
    def node_is_compatible(
        self, node: torch.fx.Node, features: Optional[OpFeatures] = None
    ) -> Tuple[bool, str]:
-        if utils.is_symint_node(node):
-            return node.target in vulkan_supported_ops, "Op is compatible"
-        elif utils.is_tensor_node(node):
+        if utils.is_tensor_node(node):
            return self.op_node_is_compatible(node, features=features)
+        # For non-tensor nodes, just check if the op is registered
+        elif hasattr(node, "target"):
+            return node.target in vulkan_supported_ops, "Op is compatible"

        return False, f"Unsupported node type: {node.format_node()}"
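Note the reordering: tensor nodes are now checked first, and the old `is_symint_node` special case is subsumed by the `hasattr(node, "target")` branch, which every `torch.fx.Node` satisfies, so any non-tensor node is now judged purely by registry membership.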
backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 9 additions & 0 deletions
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
   return idx;
 }

+ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
+  for (int i = 0; i < values_.size(); ++i) {
+    if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
+      return i;
+    }
+  }
+  return add_scalar(val);
+}
+
 ValueRef ComputeGraph::set_input_tensor(
     const ValueRef idx,
     const bool use_staging) {
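
`get_or_add_value_for_int` deduplicates integer values with a linear scan over the value list before falling back to `add_scalar`. That lookup is O(n) in the number of graph values, which is presumably acceptable because it runs only at graph-build time and keeps repeated literal ints (sizes, dims) from bloating `values_`.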
