
Commit 1913faf

Update base for Update on "upgrade lm_eval to 0.4.5"
We have been using a pretty old `lm_eval` version, which blocks us from upgrading other libraries like `transformers` and blocks other work; see, for example, #6489. In newer versions of `lm_eval`, `pretrainedModel` becomes a required parameter, whereas in 0.4.2 it defaulted to `gpt2` when not provided. This PR upgrades our `lm_eval` version to the latest release, 0.4.5, and explicitly sets `pretrainedModel` to its original default value, `gpt2`.

Test Plan: Run eval before and after this PR and confirm the perplexity number stays roughly the same.

<img width="682" alt="Screenshot 2024-10-28 at 12 22 45 PM" src="https://github.com/user-attachments/assets/f7bccc55-ad5a-4f90-8eae-eefdd8e9997a">

Differential Revision: [D65079913](https://our.internmc.facebook.com/intern/diff/D65079913)

[ghstack-poisoned]
2 parents 85d3ff6 + 2c32bf3 commit 1913faf
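
For context, a hedged sketch of what pinning the old default looks like against lm_eval's 0.4.x Python API. The wrapper and task names here are illustrative; the PR's own eval entry point is not shown on this page.

```python
import lm_eval
from lm_eval.models.huggingface import HFLM

# From 0.4.5 the pretrained model must be passed explicitly; in 0.4.2 an
# unset model silently fell back to gpt2, so the old default is pinned here.
lm = HFLM(pretrained="gpt2")

# "wikitext" is just an example perplexity task for the before/after check.
results = lm_eval.simple_evaluate(model=lm, tasks=["wikitext"])
```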

40 files changed: +872 −39 lines

.github/workflows/ghstack_land.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -11,6 +11,7 @@ on:
       - 'gh/kimishpatel/[0-9]+/base'
       - 'gh/kirklandsign/[0-9]+/base'
       - 'gh/larryliu0820/[0-9]+/base'
+      - 'gh/lucylq/[0-9]+/base'
       - 'gh/manuelcandales/[0-9]+/base'
       - 'gh/mcr229/[0-9]+/base'
       - 'gh/swolchok/[0-9]+/base'
```

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -92,7 +92,7 @@ tools.
 ├── runtime # Core C++ runtime.
 |  ├── backend # Backend delegate runtime APIs.
 |  ├── core # Core structures used across all levels of the runtime.
-|  ├── executor # Model loading, initalization, and execution.
+|  ├── executor # Model loading, initialization, and execution.
 |  ├── kernel # Kernel registration and management.
 |  ├── platform # Layer between architecture specific code and portable C++.
 ├── schema # ExecuTorch PTE file format flatbuffer
```

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -62,6 +62,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.relu.default,
             exir_ops.edge.aten.rsqrt.default,
             exir_ops.edge.aten._softmax.default,
+            exir_ops.edge.aten.select_copy.int,
             exir_ops.edge.aten._log_softmax.default,
             exir_ops.edge.aten.slice_copy.Tensor,
             exir_ops.edge.aten.sub.Tensor,
```

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@
     op_relu,
     op_repeat,
     op_rsqrt,
+    op_select,
     op_sigmoid,
     op_slice,
     op_squeeze,
```

backends/arm/operators/op_select.py

Lines changed: 69 additions & 0 deletions
```diff
@@ -0,0 +1,69 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+
+import serializer.tosa_serializer as ts
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+
+from executorch.backends.arm.tosa_mapping import TosaArg
+
+from executorch.backends.arm.tosa_utils import build_reshape, tosa_shape
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class SelectVisitor(NodeVisitor):
+    target = "aten.select_copy.int"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+
+        assert len(inputs) == 3
+        input_node, dim, index = inputs
+        shape = input_node.shape
+        rank = len(shape)
+
+        dim = dim.number % rank if dim.number < 0 else dim.number
+        index = index.number % rank if index.number < 0 else index.number
+
+        # For aten.select_copy, rank(output) == rank(input) - 1,
+        # but for TOSA, rank(in) == rank(out).
+        # Add an intermediate with the same rank as the input.
+        expanded_shape = tuple(1 if i == dim else shape[i] for i in range(rank))
+        expanded_shape = tosa_shape(expanded_shape, input_node.dim_order)
+
+        output_reshaped = tosa_graph.addIntermediate(
+            expanded_shape, ts.DType.INT8 if is_quant_node else output.dtype
+        )
+
+        attr_slice = ts.TosaSerializerAttribute()
+
+        start_attr = [index if i == dim else 0 for i in input_node.dim_order]
+        size_attr = [
+            1 if i == dim else input_node.shape[i] for i in input_node.dim_order
+        ]
+
+        attr_slice.SliceAttribute(start_attr, size_attr)
+
+        tosa_graph.addOperator(
+            TosaOp.Op().SLICE, [input_node.name], [output_reshaped.name], attr_slice
+        )
+
+        # Reshape back to original rank of output.
+        build_reshape(tosa_graph, output_reshaped.name, output.shape, output.name)
```
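
The SLICE-then-RESHAPE lowering in this visitor can be sanity-checked in plain PyTorch. A minimal sketch of the equivalence it relies on, with arbitrary shapes and `narrow` standing in for TOSA SLICE:

```python
import torch

# TOSA's SLICE keeps rank, while aten.select_copy drops the selected dim,
# so the visitor slices to a singleton dim and then reshapes it away.
x = torch.rand(5, 3, 20)
dim, index = 1, 2

sliced = x.narrow(dim, index, 1)   # shape (5, 1, 20): rank preserved, like SLICE
reshaped = sliced.reshape(5, 20)   # singleton dim removed, like RESHAPE

assert torch.equal(reshaped, torch.select_copy(x, dim, index))
```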

backends/arm/quantizer/quantization_annotation/generic_annotator.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -34,6 +34,8 @@
     # torch.ops.aten.view_as_real.default,
     # torch.ops.aten.view_as_real_copy.default,
     torch.ops.aten.view_copy.default,
+    torch.ops.aten.select.int,
+    torch.ops.aten.select_copy.int,
     torch.ops.aten.slice.Tensor,
     torch.ops.aten.slice_copy.Tensor,
     # 'concat' should be handled separately as it has a sequence of inputs and
```
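
The select ops join a list of view-like ops here; presumably (an assumption from the surrounding list, not stated in the diff) the generic annotator gives such ops a quantization spec shared with their input. A small sketch of why that is safe:

```python
import torch

# select/select_copy only pick out a sub-tensor; no values are rescaled, so
# (under the shared-qspec assumption above) the output can reuse the input's
# quantization parameters.
q = torch.randint(-128, 128, (5, 3, 20), dtype=torch.int8)
row = torch.select_copy(q, 0, 2)

assert torch.equal(row, q[2])  # identical int8 values: same scale/zero-point
```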

backends/arm/test/ops/test_select.py

Lines changed: 198 additions & 0 deletions
```diff
@@ -0,0 +1,198 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from parameterized import parameterized
+
+test_data_t = tuple[torch.Tensor, int, int]
+
+test_data_suite: list[tuple[test_data_t]] = [
+    # (test_data, dim, index)
+    ((torch.zeros(5, 3, 20), -1, 0),),
+    ((torch.zeros(5, 3, 20), 0, -1),),
+    ((torch.zeros(5, 3, 20), 0, 4),),
+    ((torch.ones(10, 10, 10), 0, 2),),
+    ((torch.rand(5, 3, 20, 2), 0, 2),),
+    ((torch.rand(10, 10) - 0.5, 0, 0),),
+    ((torch.randn(10) + 10, 0, 1),),
+    ((torch.randn(10) - 10, 0, 2),),
+    ((torch.arange(-16, 16, 0.2), 0, 1),),
+]
+
+
+class TestSelect(unittest.TestCase):
+    class SelectCopy(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, dim: int, index: int):
+            return torch.select_copy(x, dim=dim, index=index)
+
+    class SelectInt(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, dim: int, index: int):
+            return torch.select(x, dim=dim, index=index)
+
+    def _test_select_tosa_MI_pipeline(
+        self,
+        module: torch.nn.Module,
+        test_data: test_data_t,
+        export_target: str,
+    ):
+        # For 4D tensors, do not permute to NHWC
+        permute = False if len(test_data[0].shape) == 4 else True
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(
+                    permute_memory_to_nhwc=permute
+                ),
+            )
+            .export()
+            .check([export_target])
+            .check_not(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_select_tosa_BI_pipeline(
+        self,
+        module: torch.nn.Module,
+        test_data: test_data_t,
+        export_target: str,
+    ):
+        # For 4D tensors, do not permute to NHWC
+        permute = False if len(test_data[0].shape) == 4 else True
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(
+                    permute_memory_to_nhwc=permute
+                ),
+            )
+            .quantize()
+            .export()
+            .check([export_target])
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .dump_artifact()
+            .dump_operator_distribution()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_select_ethos_BI_pipeline(
+        self,
+        compile_spec: list[CompileSpec],
+        module: torch.nn.Module,
+        test_data: test_data_t,
+        export_target: str,
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=compile_spec,
+            )
+            .quantize()
+            .export()
+            .check([export_target])
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .dump_artifact()
+            .dump_operator_distribution()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+        )
+
+    def _test_select_tosa_u55_BI_pipeline(
+        self, module: torch.nn.Module, test_data: test_data_t, export_target: str
+    ):
+        # For 4D tensors, do not permute to NHWC
+        permute = False if len(test_data[0].shape) == 4 else True
+        self._test_select_ethos_BI_pipeline(
+            common.get_u55_compile_spec(permute_memory_to_nhwc=permute),
+            module,
+            test_data,
+            export_target,
+        )
+
+    def _test_select_tosa_u85_BI_pipeline(
+        self, module: torch.nn.Module, test_data: test_data_t, export_target: str
+    ):
+        # For 4D tensors, do not permute to NHWC
+        permute = False if len(test_data[0].shape) == 4 else True
+        self._test_select_ethos_BI_pipeline(
+            common.get_u85_compile_spec(permute_memory_to_nhwc=permute),
+            module,
+            test_data,
+            export_target,
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_copy_tosa_MI(self, test_data: test_data_t):
+        self._test_select_tosa_MI_pipeline(
+            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_int_tosa_MI(self, test_data: test_data_t):
+        self._test_select_tosa_MI_pipeline(
+            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_copy_tosa_BI(self, test_data: test_data_t):
+        self._test_select_tosa_BI_pipeline(
+            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_int_tosa_BI(self, test_data: test_data_t):
+        self._test_select_tosa_BI_pipeline(
+            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_copy_tosa_u55_BI(self, test_data: test_data_t):
+        self._test_select_tosa_u55_BI_pipeline(
+            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_int_tosa_u55_BI(self, test_data: test_data_t):
+        self._test_select_tosa_u55_BI_pipeline(
+            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_copy_tosa_u85_BI(self, test_data: test_data_t):
+        self._test_select_tosa_u85_BI_pipeline(
+            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_select_int_tosa_u85_BI(self, test_data: test_data_t):
+        self._test_select_tosa_u85_BI_pipeline(
+            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
+        )
```

backends/cadence/aot/ops_registrations.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -188,7 +188,7 @@ def quantized_relu_meta(
     out_multiplier: torch.Tensor,
     out_shift: torch.Tensor,
 ) -> torch.Tensor:
-    return X.new_empty(X.size(), dtype=torch.uint8)
+    return X.new_empty(X.size(), dtype=X.dtype)
 
 
 @register_fake("cadence::quantized_matmul")
```

backends/cadence/hifi/operators/dequantize_per_tensor.cpp

Lines changed: 4 additions & 1 deletion
```diff
@@ -45,7 +45,10 @@ void dequantize_per_tensor_out(
     const int32_t* input_data = input.const_data_ptr<int32_t>();
     dequantize<int32_t>(out_data, input_data, scale, zero_point, numel);
   } else {
-    ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type());
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
   }
 }
```

backends/cadence/hifi/operators/quantize_per_tensor.cpp

Lines changed: 4 additions & 1 deletion
```diff
@@ -49,7 +49,10 @@ void quantize_per_tensor_out(
     cadence::impl::HiFi::kernels::quantize<int32_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else {
-    ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type());
+    ET_CHECK_MSG(
+        false,
+        "Unhandled output dtype %hhd",
+        static_cast<int8_t>(out.scalar_type()));
   }
 }
```

backends/cadence/reference/operators/quantized_conv_out.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -248,6 +248,11 @@ void quantized_conv_out(
         output_scale,
         (int8_t)output_zero_point,
         per_tensor_quantized);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
   }
 }
```
