
Commit bc2162b

Merge branch 'main' into constant-ops-aot
2 parents 9467d4f + 043c7a0


123 files changed, +2346 -1575 lines changed

.ci/scripts/build_android_instrumentation.sh

Lines changed: 6 additions & 5 deletions
@@ -13,9 +13,11 @@ fi
 which "${PYTHON_EXECUTABLE}"
 
 build_android_test() {
-  pushd extension/android_test
-  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew testDebugUnitTest
-  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest
+  mkdir -p extension/android/executorch_android/src/androidTest/resources
+  cp extension/module/test/resources/add.pte extension/android/executorch_android/src/androidTest/resources
+  pushd extension/android
+  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
+  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
   popd
 }
 
@@ -24,8 +26,7 @@ collect_artifacts_to_be_uploaded() {
   # Collect Java library test
   JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir"
   mkdir -p "${JAVA_LIBRARY_TEST_DIR}"
-  cp extension/android_test/build/outputs/apk/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}"
-  cp extension/android_test/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}"
+  cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}"
 }
 
 main() {

.github/workflows/_android.yml

Lines changed: 5 additions & 4 deletions
@@ -28,14 +28,16 @@ jobs:
 PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2
 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
+mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+bash examples/models/llama/install_requirements.sh
+bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+
 # Build LLM Demo for Android
 export BUILD_AAR_DIR=aar-out
+mkdir -p $BUILD_AAR_DIR
 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 bash .ci/scripts/build_android_instrumentation.sh ${ARTIFACTS_DIR_NAME}
 
-mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
-bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
-
 mkdir -p examples/demo-apps/android/LlamaDemo/app/libs
 cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs
 pushd examples/demo-apps/android/LlamaDemo
 
@@ -94,7 +96,6 @@ jobs:
 curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk
 curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk
 curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip
-curl -o android-test-debug.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch-debug.apk
 curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch-debug-androidTest.apk
 unzip model.zip
 mv *.pte model.pte

.github/workflows/doc-build.yml

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@ jobs:
 with:
   job-name: Build doc
   runner: linux.2xlarge
-  docker-image: executorch-ubuntu-22.04-clang12
+  docker-image: executorch-ubuntu-22.04-clang12-android
   submodules: 'true'
   repository: pytorch/executorch
   upload-artifact: docs
 
@@ -70,8 +70,8 @@ jobs:
 
 # Build javadoc:
 cd extension/android
-./gradlew javadoc
-cp -rf build/docs/javadoc "${RUNNER_DOCS_DIR}"
+ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:javaDocReleaseGeneration
+cp -rf executorch_android/build/intermediates/java_doc_dir/release/javaDocReleaseGeneration "${RUNNER_DOCS_DIR}/javadoc"
 cd ../..
 
 # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing.

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
@@ -76,8 +76,8 @@ jobs:
 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
 timeout: 90
 script: |
-  FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \
-    examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \
+  FILES_NEEDS_FORMAT=$(/opt/google-java-format -n \
+    extension/android/executorch_android/src/main/java/org/pytorch/executorch/*.java \
     examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \
     extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java)
  if [ -n "$FILES_NEEDS_FORMAT" ]; then

.github/workflows/pull.yml

Lines changed: 1 addition & 2 deletions
@@ -60,7 +60,7 @@ jobs:
 - runner: linux.arm64.2xlarge
   docker-image: executorch-ubuntu-22.04-clang12
 # TODO: Need to figure out why buck2 doesnt work on Graviton instances.
-- runner: linux.arm64.2xlarge
+- runner: linux.arm64.2xlarge
   build-tool: buck2
 fail-fast: false
 with:
 
@@ -420,7 +420,6 @@ jobs:
 permissions:
   id-token: write
   contents: read
-needs: test-llama-runner-linux
 
 unittest:
   uses: ./.github/workflows/_unittest.yml

backends/arm/_passes/insert_rescales_pass.py

Lines changed: 6 additions & 6 deletions
@@ -38,17 +38,17 @@ def rescale_fake(
     """Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op.
     Additionally validates TOSA constraints of a RESCALE op.
     """
-    if not (dtype == torch.int32 or dtype == torch.int8):
+    if dtype not in (torch.int32, torch.int8, torch.int16):
         raise NotImplementedError(
-            "tosa::rescale currently only supports int32 and int8."
+            f"tosa::rescale currently only supports int32, int16 and int8, not {dtype}"
         )
-    if dtype == torch.int32 and out_zp != 0:
+    if dtype in (torch.int32, torch.int16) and out_zp != 0:
         raise ValueError(
-            "TOSA requires output_zp to be zero when the output dtype is int32."
+            f"TOSA requires output_zp to be zero when the output dtype is {dtype}."
         )
-    if x.dtype == torch.int32 and in_zp != 0:
+    if x.dtype in (torch.int32, torch.int16) and in_zp != 0:
         raise ValueError(
-            "TOSA requires input_zp to be zero when the input dtype is int32."
+            f"TOSA requires input_zp to be zero when the input dtype is {dtype}"
         )
     if x.dtype == torch.int8 and not -128 <= in_zp <= 127:
         raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.")

backends/arm/_passes/insert_table_ops.py

Lines changed: 106 additions & 15 deletions
@@ -1,5 +1,4 @@
 # Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

@@ -18,6 +17,7 @@
 
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule
+
 from torch.library import impl, Library
 
 lib = Library("tosa", "DEF")

@@ -26,7 +26,10 @@
 
 @impl(lib, "_table")
 def _table_impl(*args, **kwargs):  # pyre-ignore
-    return args[0]
+    in_dtype = args[0].dtype
+    if in_dtype == torch.int8:
+        return args[0]
+    return args[0].to(dtype=torch.int32)
 
 
 class InsertTableOpsPass(ExportPass):

@@ -59,29 +62,105 @@ def register_buffer(self, buffer_name: str, buffer: torch.Tensor) -> None:
         """
         self.exported_program.state_dict[buffer_name] = buffer
 
-    def generate_table_values(
+    def generate_8bit_table_values(
         self,
         torch_op: Callable[[torch.Tensor], torch.Tensor],
         in_quantargs: QuantArgs,
         out_quantargs: QuantArgs,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, int]:
+        """Compute LUT values for an INT8 TOSA.TABLE. Also returns 0 since no shifting is required after the 8-bit table.
+        The INT8 table is a simple 256-value 1-1 LUT.
+        """
+
         def f(x: torch.Tensor) -> torch.Tensor:
             x = in_quantargs.dequantize_value(x)
             x = torch_op(x)
             return out_quantargs.quantize_value(x)
 
-        input_dtype = in_quantargs.dtype
-        steps = in_quantargs.qmax - in_quantargs.qmin + 1
-        return f(
+        return (
+            f(
+                torch.linspace(
+                    start=in_quantargs.qmin,
+                    end=in_quantargs.qmax,
+                    steps=256,
+                    # use torch.int64 to avoid overflow when dequantizing (subtracting zp).
+                    # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8)
+                    dtype=torch.int64,
+                )
+            ).to(dtype=torch.int8),
+            0,
+        )
+
+    def generate_16_bit_table_values(
+        self,
+        torch_op: Callable[[torch.Tensor], torch.Tensor],
+        in_quantargs: QuantArgs,
+        out_quantargs: QuantArgs,
+    ) -> tuple[torch.Tensor, int]:
+        """Compute LUT values for an INT16 TOSA.TABLE with 32 bit output.
+        In practice the output is 23 bits that should be interpreted as 16 'whole' bits and 7 fractional bits, see
+        the specification: https://www.mlplatform.org/tosa/tosa_spec.html#_table. This means that the output
+        will be interpreted as 2**7=128 times too large unless accounted for by rescaling down the table output.
+
+        Quantization can be either int16 or int32, which means that the op output could be larger than the 23 bits from
+        the TOSA.TABLE output. In that case, we need to rescale up the output.
+
+        To handle this we need to:
+        1) Make sure that our table values fit within 16 bits.
+        2) Insert a rescale after the table to handle the x128 from the fractional bits and match the quantization.
+
+        The function returns rescale_lshift, which says how much to rescale after the table. This value can be negative.
+        """
+
+        def f(x: torch.Tensor) -> torch.Tensor:
+            # Don't use the 7 LSBs.
+            x = in_quantargs.dequantize_value((x & ~0x7F))
+            x = torch_op(x)
+            return out_quantargs.quantize_value(x)
+
+        lut_values = f(
             torch.linspace(
                 start=in_quantargs.qmin,
-                end=in_quantargs.qmax,
-                steps=steps,
+                end=in_quantargs.qmax + 1,
+                steps=513,
                 # use torch.int64 to avoid overflow when dequantizing (subtracting zp).
                 # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8)
                 dtype=torch.int64,
             )
-        ).to(dtype=input_dtype)
+        )
+        # Calculate how much we need to shift table values to fit in 16 signed bits:
+        # ceil(log2(max absolute table value)) + 1 bit for signedness - 16
+        # Example:
+        # Max value in the table is 70 000. We want to fit it in 16 signed bits.
+        # 70 000 = 0b10001000101110000 (17 digits) has ceil(log2(70 000)) = ceil(16.095) = 17 bits.
+        # If we shift it 17-16=1 bit, we do get 16 bits (0b1000100010111000),
+        # but due to signedness this is a negative number! So we need to shift it one more bit.
+        # Note: for out_quantargs.dtype=torch.int16, rshift == 0 and rescale_lshift = -7.
+        rshift = int(torch.ceil(torch.log2(lut_values.abs().max()))) + 1 - 16
+        # The 7 fractional bits are equivalent to a lshift of 7, so subtract 7 from the lshift we do.
+        rescale_lshift = rshift - 7
+        lut_values = lut_values >> rshift
+        return lut_values.to(dtype=torch.int16), rescale_lshift
+
+    def generate_table_values(
+        self,
+        torch_op: Callable[[torch.Tensor], torch.Tensor],
+        in_quantargs: QuantArgs,
+        out_quantargs: QuantArgs,
+    ) -> tuple[torch.Tensor, int]:
+        match out_quantargs.dtype:
+            case torch.int8:
+                return self.generate_8bit_table_values(
+                    torch_op, in_quantargs, out_quantargs
+                )
+            case torch.int16 | torch.int32:
+                return self.generate_16_bit_table_values(
+                    torch_op, in_quantargs, out_quantargs
+                )
+            case _:
+                raise ValueError(
+                    f"Unsupported output dtype for table: {out_quantargs.dtype}"
+                )
 
     def call(self, graph_module: GraphModule) -> PassResult:
         modified = False

@@ -100,10 +179,12 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     op_target=torch.ops.tosa._table.default,
                     args=(node.args[0],),
                 )
+                output_node = table_node
                 assert len(input_qparams) == 1
                 assert len(output_qparams) == 1
-                # Generate table buffer
-                buffer = self.generate_table_values(
+
+                # Generate table buffer and how much to lshift the table output.
+                buffer, lshift = self.generate_table_values(
                     torch_op=self.table_ops[node.target],
                     in_quantargs=input_qparams[0],
                     out_quantargs=output_qparams[0],

@@ -114,10 +195,20 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 self.register_buffer(
                     buffer_name=table_node.name.replace("_default", ""), buffer=buffer
                 )
-                node.replace_all_uses_with(table_node)
+
+                if lshift != 0:
+                    scale = 2.0**lshift
+                    rescale_node = create_node(
+                        graph=graph_module.graph,
+                        op_target=torch.ops.tosa._rescale.default,
+                        args=(table_node, output_qparams[0].dtype, scale, 0, 0),
+                    )
+                    output_node = rescale_node
+
+                node.replace_all_uses_with(output_node)
                 graph_module.graph.erase_node(node)
-                table_node.meta["input_qparams"] = input_qparams
-                table_node.meta["output_qparams"] = output_qparams
+                output_node.meta["input_qparams"] = input_qparams
+                output_node.meta["output_qparams"] = output_qparams
                 modified = True
 
         if modified:
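
To make the shift bookkeeping in generate_16_bit_table_values concrete, here is a self-contained sketch of the same arithmetic on a toy set of LUT values (fit_lut_into_int16 is a made-up helper name for illustration; the 70 000 example comes from the comment in the diff above):

import torch

def fit_lut_into_int16(lut_values: torch.Tensor) -> tuple[torch.Tensor, int]:
    # Bits needed for the largest magnitude, plus one sign bit, minus the 16 bits
    # available in a signed 16-bit lane (assumes the values need more than 16 bits).
    rshift = int(torch.ceil(torch.log2(lut_values.abs().max()))) + 1 - 16
    # TOSA.TABLE already adds 7 fractional bits (a factor of 2**7), so the rescale
    # that follows the table shifts by rshift - 7 in total; this can be negative.
    rescale_lshift = rshift - 7
    return (lut_values >> rshift).to(torch.int16), rescale_lshift

values = torch.tensor([-70_000, 0, 70_000], dtype=torch.int64)
table, lshift = fit_lut_into_int16(values)
print(table)   # [-17500, 0, 17500]: shifted right by 2 bits, the values now fit in int16
print(lshift)  # -5: the rescale after the table scales the result by 2**-5 overall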

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 20 additions & 5 deletions
@@ -212,26 +212,41 @@ def is_node_supported(
 class EthosU55NotSupported(OperatorSupportBase):
     """
     Certain operators are not supported on U55. These are listed in `unsupported_ops`.
+    Where it is not obvious, a comment notes the unsupported TOSA operator that the aten operator maps to.
+    For unimplemented operators, this is the anticipated mapping, and it might be incorrect.
     """
 
     unsupported_ops = [
-        exir_ops.edge.aten.any.default,
-        exir_ops.edge.aten.any.dim,
-        exir_ops.edge.aten.any.dims,
+        exir_ops.edge.aten.any.default,  # REDUCE_ANY
+        exir_ops.edge.aten.any.dim,  # REDUCE_ANY
+        exir_ops.edge.aten.any.dims,  # REDUCE_ANY
         exir_ops.edge.aten.bitwise_and.Tensor,
         exir_ops.edge.aten.bitwise_or.Tensor,
         exir_ops.edge.aten.bitwise_xor.Tensor,
+        exir_ops.edge.aten.bitwise_not,
         exir_ops.edge.aten.logical_and.default,
         exir_ops.edge.aten.logical_or.default,
         exir_ops.edge.aten.logical_xor.default,
         exir_ops.edge.aten.logical_not.default,
-        exir_ops.edge.aten.amax.default,
-        exir_ops.edge.aten.amin.default,
+        exir_ops.edge.aten.amax.default,  # REDUCE_MAX
+        exir_ops.edge.aten.amin.default,  # REDUCE_MIN
         exir_ops.edge.aten.eq.Tensor,
         exir_ops.edge.aten.ge.Tensor,
         exir_ops.edge.aten.gt.Tensor,
         exir_ops.edge.aten.le.Tensor,
         exir_ops.edge.aten.lt.Tensor,
+        exir_ops.edge.aten.flip.default,  # REVERSE
+        exir_ops.edge.aten.grid_sampler_2d,  # GATHER
+        exir_ops.edge.aten.scatter.src,
+        exir_ops.edge.aten.scatter.value,
+        exir_ops.edge.aten.select_scatter.default,
+        exir_ops.edge.aten.scatter_reduce.two,
+        exir_ops.edge.aten.scatter_add.default,
+        exir_ops.edge.aten.upsample_nearest2d.vec,  # RESIZE
+        exir_ops.edge.aten.upsample_bilinear2d.vec,  # RESIZE
+        exir_ops.edge.aten.reflection_pad1d.default,  # REVERSE
+        exir_ops.edge.aten.reflection_pad2d.default,  # REVERSE
+        exir_ops.edge.aten.reflection_pad3d.default,  # REVERSE
     ]
 
     def __init__(self, reporter: WhyNoPartitionReporter):

backends/arm/operators/node_visitor.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ class NodeVisitor:
     ]
 
     def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecification):
-        self._exported_program = exported_program or None
+        self._exported_program = exported_program
         self.tosa_spec = tosa_spec
 
     def define_node(

backends/arm/operators/op_rescale.py

Lines changed: 5 additions & 3 deletions
@@ -38,7 +38,6 @@ def define_node(
         input_zp = cast(int, node.args[3])
         output_zp = cast(int, node.args[4])
 
-        # Skip int16 cases for now.
         if input_dtype != map_dtype(torch.int8) and input_zp != 0:
             raise ValueError(
                 f"If input dtype is not int8, input_zp must be 0. Got input_dtype{ts.DTypeNames[input_dtype]}, {input_zp=}"

@@ -48,7 +47,10 @@
                 f"If output dtype is not int8, output_zp must be 0. Got {output_dtype=}, {output_zp=}"
             )
 
-        scale_width = 32 if output_dtype == torch.int32 else 16
+        # scale32 gives higher accuracy but at a higher HW cost.
+        # For now, always go for scale32.
+        scale_32 = True
+        scale_width = 32 if scale_32 else 16
         multiplier, shift = tosa_quant_utils.compute_multiplier_and_shift(
             [scale], scale_width
         )

@@ -58,7 +60,7 @@
             output_zp=output_zp,
             multiplier=multiplier,
             shift=shift,
-            scale32=output_dtype == torch.int32,
+            scale32=scale_32,
             double_round=False,
             per_channel=False,
             input_unsigned=False,
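
For context, the scale32 path encodes a floating-point rescale factor as an integer multiplier plus a right shift. The sketch below shows one common way to do that decomposition; it is an illustrative assumption, not the backend's compute_multiplier_and_shift, and the rounding details may differ:

import math

def decompose_scale32(scale: float) -> tuple[int, int]:
    # Express scale as mantissa * 2**exponent with mantissa in [0.5, 1).
    mantissa, exponent = math.frexp(scale)
    # Encode the mantissa as a signed 32-bit fixed-point multiplier (Q31).
    multiplier = round(mantissa * (1 << 31))
    shift = 31 - exponent
    # Rounding can push the mantissa up to exactly 2**31; renormalize if so.
    if multiplier == (1 << 31):
        multiplier //= 2
        shift -= 1
    return multiplier, shift

# value * scale is then approximated as (value * multiplier) >> shift.
multiplier, shift = decompose_scale32(2.0**-5)  # e.g. a power-of-two correction like the table rescale above
print(multiplier, shift)  # 1073741824 35, i.e. (v << 30) >> 35 == v >> 5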
