
Commit 9d84a42

Merge branch 'main' into jz/fix-prefill
2 parents 5c53856 + 25d8f15

13 files changed: +184 -14 lines

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2ea4b56ec872424e486c4fe2d55da061067a2ed3
+0a94bb432ed75cc2d950d81b2921363218a7e459

backends/arm/README.md

Lines changed: 22 additions & 0 deletions
@@ -122,6 +122,28 @@ Then you can run the tests with
 pytest -c /dev/null -v -n auto backends/arm/test --arm_quantize_io --arm_run_corstoneFVP
 ```
 
+### Code coverage
+
+To get code coverage:
+
+```
+coverage run --source=<SRC> --rcfile=backends/arm/test/.coveragerc -m pytest \
+    --config-file=/dev/null backends/arm/test/
+```
+
+All files in `SRC` and its child directories will be analysed for code coverage,
+unless explicitly excluded in the .coveragerc file. If using a venv, this might be
+under `env/lib/python<VERSION_NUMBER>/site-packages/executorch/`. To get the
+absolute path, run:
+
+```
+python -c "import executorch; print(executorch.__path__)"
+```
+
+This prints a list of paths where the source directory is located. Pick the
+one located in `env/lib`. If that does not work, try the others. Add
+`backends/arm` to the path in `--source` to only get code coverage for the Arm
+backend.
 
 ### A note on unit tests
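As a side note on the workflow this section describes, here is a minimal Python sketch (not part of the commit) that resolves the installed executorch path and prints a ready-made coverage command scoped to the Arm backend; it assumes executorch is importable from the active venv:

```
# Sketch: resolve the installed executorch path and assemble the coverage
# command described in the README section above.
import executorch

# executorch.__path__ may list several locations; prefer the site-packages one.
candidates = list(executorch.__path__)
src = next((p for p in candidates if "site-packages" in p), candidates[0])

print(
    f"coverage run --source={src}/backends/arm "
    "--rcfile=backends/arm/test/.coveragerc "
    "-m pytest --config-file=/dev/null backends/arm/test/"
)
```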

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 0 deletions
@@ -37,6 +37,9 @@
     QuantizeFullArgument,
     RetraceFoldedDtypesPass,
 )
+from executorch.backends.arm._passes.fuse_quantized_activation_pass import (
+    FuseQuantizedActivationPass,
+)
 from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
     KeepDimsFalseToSqueezePass,
@@ -73,6 +76,7 @@ def transform_to_backend_pipeline(
         self, exported_program: ExportedProgram, compile_spec: list[CompileSpec]
     ):
         """Apply passes before transforming program to backend"""
+        self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(DecomposeLayerNormPass())
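Note that the new pass is registered first in the pipeline, so quantized activations are fused before linear layers are decomposed. A minimal sketch of running the pass on its own, assuming an `exported_program` produced by torch.export and the standard PassResult protocol:

```
# Sketch (assumed setup): apply the new pass directly to a graph module.
from executorch.backends.arm._passes.fuse_quantized_activation_pass import (
    FuseQuantizedActivationPass,
)

result = FuseQuantizedActivationPass()(exported_program.graph_module)
print("graph modified:", result.modified)  # PassResult carries the fused graph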
backends/arm/_passes/fuse_quantized_activation_pass.py

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm.tosa_quant_utils import q_op
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx import Node
+
+
+class FuseQuantizedActivationPass(ExportPass):
+    def _is_fuseable_quantized_activation(self, node: Node):
+        """Fuse activations that have a 0 lower bound and are quantized with a zero point equal to qmin."""
+        is_fuseable = node.target == exir_ops.edge.aten.relu.default
+        if node.target == exir_ops.edge.aten.hardtanh.default:
+            min_val = node.args[1]
+            is_fuseable = min_val == 0
+
+        is_quantized = len(node.users) == 1 and next(iter(node.users)).target == q_op
+        if is_quantized:
+            quant_node = next(iter(node.users))
+            zp = quant_node.args[2]
+            qmin = quant_node.args[3]
+
+        return is_fuseable and is_quantized and zp == qmin
+
+    def _is_fuseable_input(self, node: Node):
+        return (
+            node.target
+            in (
+                exir_ops.edge.aten.convolution.default,
+                exir_ops.edge.aten.linear.default,
+            )
+            and len(node.users) == 1
+        )
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function":
+                continue
+
+            if not self._is_fuseable_quantized_activation(node):
+                continue
+
+            input_node = node.args[0]
+            if not self._is_fuseable_input(input_node):
+                continue
+
+            node.replace_all_uses_with(input_node)
+            graph_module.graph.erase_node(node)
+            modified = True
+
+        if modified:
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, modified)
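The fusion is sound because a quantize node whose zero point equals qmin already clamps every negative pre-activation to qmin, so a ReLU (or hardtanh with a zero lower bound) in front of it is a no-op. A small numeric sketch with assumed int8-style parameters:

```
# Illustrative check (values assumed): quantize(relu(x)) == quantize(x)
# whenever zero_point == qmin, since clamping to qmin subsumes the ReLU.
import torch

scale, zero_point, qmin, qmax = 0.1, -128, -128, 127
x = torch.tensor([-1.0, -0.05, 0.0, 0.7])

def quantize(t):
    # Affine quantization: clamp(round(t / scale) + zero_point, qmin, qmax)
    return torch.clamp(torch.round(t / scale) + zero_point, qmin, qmax)

assert torch.equal(quantize(torch.relu(x)), quantize(x))
```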

backends/arm/quantizer/quantization_annotator.py

Lines changed: 65 additions & 1 deletion
@@ -89,6 +89,41 @@ def _annotate_output(node: Node, quant_property: _QuantProperty):
     _annotate_output_qspec(node, quant_property.qspec)
 
 
+def _match_pattern(
+    node: Node, pattern: List[List], filter_fn: Optional[Callable[[Node], bool]] = None
+) -> bool:
+    """
+    Check if there is a parent -> child chain through 'node' that matches the
+    chain provided in 'pattern'. If 'filter_fn' is provided, check that all the
+    nodes in the chain pass the filtering.
+
+    Each 'pattern' element is composed of a list of disjunctive node types.
+    """
+    assert len(pattern) == 2, "Only two-node patterns supported currently"
+
+    if node.target in pattern[0]:
+        assert len(node.users) != 0
+        parent = node
+        child = next(iter(node.users))
+    elif node.target in pattern[1]:
+        assert len(node.args) != 0
+        parent = node.args[0]
+        child = node
+    else:
+        return False
+
+    if len(parent.users) != 1:
+        return False
+
+    if parent.target not in pattern[0] or child.target not in pattern[1]:
+        return False
+
+    if filter_fn is not None:
+        return filter_fn(parent) and filter_fn(child)
+
+    return True
+
+
 _one_to_one = [
     torch.ops.aten.exp.default,
     torch.ops.aten.log.default,
@@ -164,7 +199,36 @@ def get_quant_properties(  # noqa: C901
     bias_qspec = quantization_config.get_bias_qspec()
 
     quant_properties = _OpQuantProperties()
-    if node.target in (
+
+    def any_or_hardtanh_min_zero(n: Node):
+        # Check that if the node is a hardtanh, its min_val is zero
+        return n.target != torch.ops.aten.hardtanh.default or n.args[1] == 0
+
+    if _match_pattern(
+        node,
+        [
+            [
+                torch.ops.aten.conv1d.default,
+                torch.ops.aten.conv2d.default,
+                torch.ops.aten.linear.default,
+            ],
+            [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default],
+        ],
+        any_or_hardtanh_min_zero,
+    ):
+        if node.target in (
+            torch.ops.aten.conv1d.default,
+            torch.ops.aten.conv2d.default,
+            torch.ops.aten.linear.default,
+        ):
+            quant_properties.quant_inputs = [
+                _QuantProperty(0, input_act_qspec),
+                _QuantProperty(1, weight_qspec, mark_annotated=True),
+                _QuantProperty(2, bias_qspec, optional=True, mark_annotated=True),
+            ]
+        else:
+            quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
+    elif node.target in (
         torch.ops.aten.conv1d.default,
         torch.ops.aten.conv2d.default,
         torch.ops.aten.linear.default,
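For orientation, a sketch of how `_match_pattern` is driven by the branch above; `node` stands for any FX node the annotator visits, and the filter is the `any_or_hardtanh_min_zero` helper defined inside `get_quant_properties`:

```
# Sketch: the two-element pattern mirrors the conv/linear -> relu/hardtanh
# chain matched above. `node` is assumed to be a torch.fx.Node.
pattern = [
    [torch.ops.aten.conv2d.default, torch.ops.aten.linear.default],
    [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default],
]
if _match_pattern(node, pattern, any_or_hardtanh_min_zero):
    # True both when `node` is the conv/linear (parent) and when it is the
    # activation (child), so the pair is annotated consistently.
    pass
```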

backends/arm/test/.coveragerc

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+[run]
+omit =
+    *__init__.py*
+
+[report]
+skip_covered = true
+exclude_also =
+    raise NotImplementedError
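For reference (not part of the commit): `skip_covered = true` hides fully covered files from the report, and the `exclude_also` pattern keeps stubs like the hypothetical one below from being counted as missed lines:

```
# Hypothetical stub: its body matches `raise NotImplementedError`, so
# coverage.py excludes it from the report rather than flagging it as missed.
def op_not_yet_supported(op):
    raise NotImplementedError
```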

backends/arm/test/ops/test_conv_combos.py

Lines changed: 4 additions & 3 deletions
@@ -137,10 +137,11 @@ class ComboConvRelu6(torch.nn.Module):
     ]
 
     test_data = [
-        (20 * torch.randn(1, 3, 256, 256),),
-        (5 * torch.randn(1, 3, 256, 256),),
+        (2 * torch.randn(1, 3, 256, 256),),
+        (0.5 * torch.randn(1, 3, 256, 256),),
         (torch.randn(1, 3, 256, 256),),
-        (-5 * torch.randn(1, 3, 256, 256),),
+        (-0.5 * torch.randn(1, 3, 256, 256),),
+        (-2 * torch.randn(1, 3, 256, 256),),
     ]
 
     def __init__(self):

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl

Lines changed: 5 additions & 1 deletion
@@ -35,7 +35,11 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 * output at a single output location.
 */
 void main() {
-  const ivec3 pos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
+  const ivec3 pos = ivec3(
+      gl_GlobalInvocationID.x % out_limits.x,
+      div_by_x % out_limits.y,
+      div_by_x / out_limits.y);
 
   if (any(greaterThanEqual(pos, out_limits))) {
     return;
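This change, and the two shader changes that follow, inline the flat-index-to-position mapping formerly provided by `idx_to_ipos_x_wise` (removed from indexing_utils.h below). A Python rendering of the arithmetic, with x varying fastest:

```
# Sketch of the inlined GLSL index math: map a flat invocation index to an
# (x, y, z) position, x varying fastest.
def idx_to_pos(idx: int, size_x: int, size_y: int) -> tuple:
    div_by_x = idx // size_x
    return (idx % size_x, div_by_x % size_y, div_by_x // size_y)

# Round trip: idx == x + size_x * (y + size_y * z)
assert idx_to_pos(7, size_x=3, size_y=2) == (1, 0, 1)  # 7 = 1 + 3 * (0 + 2 * 1)
```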

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 5 additions & 1 deletion
@@ -47,7 +47,11 @@ void main() {
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
   const ivec2 out_limits_xy_scaled = (out_limits.xy + ivec2(BATCH_SIZE_X, BATCH_SIZE_Y) - 1) / ivec2(BATCH_SIZE_X, BATCH_SIZE_Y);
 
-  ivec3 pos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_xy_scaled.x, out_limits_xy_scaled.y);
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x;
+  ivec3 pos = ivec3(
+      gl_GlobalInvocationID.x % out_limits_xy_scaled.x,
+      div_by_x % out_limits_xy_scaled.y,
+      div_by_x / out_limits_xy_scaled.y);
 
   // scale pos.xy by batch sizes, because that's the top pixel to be processed
   pos.x *= BATCH_SIZE_X;

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 5 additions & 1 deletion
@@ -44,7 +44,11 @@ void main() {
   const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const ivec3 gpos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x;
+  const ivec3 gpos = ivec3(
+      gl_GlobalInvocationID.x % out_limits_scaled.x,
+      div_by_x % out_limits_scaled.y,
+      div_by_x / out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 0 additions & 5 deletions
@@ -223,11 +223,6 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
   return pos;
 }
 
-ivec3 idx_to_ipos_x_wise(uint idx, int size_x, int size_y) {
-  const uint div_by_x = idx / size_x;
-  return ivec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
-}
-
 #ifdef USING_BUFFER
 #define load_texel(buf, idx) buf[idx]
 #elif defined(USING_TEXTURE2D)

backends/xnnpack/test/tester/tester.py

Lines changed: 4 additions & 0 deletions
@@ -1,4 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -679,6 +680,9 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03):
         for i in range(len(model_output)):
             model = model_output[i]
             ref = ref_output[i]
+            assert (
+                ref.shape == model.shape
+            ), f"Output {i} shape {model.shape} does not match reference output shape {ref.shape}"
             assert torch.allclose(
                 model,
                 ref,
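The added assertion makes shape mismatches fail with an explicit message before `torch.allclose` runs. An illustrative repro with made-up tensors:

```
# Sketch: mismatched output shapes now fail with a readable message.
import torch

model, ref = torch.zeros(1, 4), torch.zeros(1, 5)
try:
    assert (
        ref.shape == model.shape
    ), f"Output 0 shape {model.shape} does not match reference output shape {ref.shape}"
except AssertionError as err:
    print(err)
```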

install_requirements.py

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ def python_is_compatible():
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION = "dev20241218"
+NIGHTLY_VERSION = "dev20250104"
 
 # The pip repository that hosts nightly torch packages.
 TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
