pytorch
diff --git a/‎.ci/scripts/gather_test_models.py
Lines changed: 11 additions & 9 deletions b/‎.ci/scripts/gather_test_models.py
Lines changed: 11 additions & 9 deletions
diff --git a/‎.ci/scripts/setup-macos.sh
Lines changed: 3 additions & 0 deletions b/‎.ci/scripts/setup-macos.sh
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/apple.yml
Lines changed: 6 additions & 0 deletions b/‎.github/workflows/apple.yml
Lines changed: 6 additions & 0 deletions
diff --git a/‎.github/workflows/ghstack_land.yml
Lines changed: 1 addition & 15 deletions b/‎.github/workflows/ghstack_land.yml
Lines changed: 1 addition & 15 deletions
diff --git a/‎.github/workflows/pull.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/pull.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/trunk.yml
Lines changed: 4 additions & 2 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 16 additions & 40 deletions b/‎CMakeLists.txt
Lines changed: 16 additions & 40 deletions
diff --git a/‎backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm
Lines changed: 0 additions & 1 deletion b/‎backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm
Lines changed: 0 additions & 1 deletion b/‎backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 32 additions & 11 deletions b/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 32 additions & 11 deletions
diff --git a/‎backends/arm/_passes/scalars_to_attribute_pass.py
Lines changed: 5 additions & 0 deletions b/‎backends/arm/_passes/scalars_to_attribute_pass.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/arm/test/ops/test_scalars.py
Lines changed: 9 additions & 0 deletions b/‎backends/arm/test/ops/test_scalars.py
Lines changed: 9 additions & 0 deletions
diff --git a/‎backends/arm/test/runner_utils.py
Lines changed: 0 additions & 2 deletions b/‎backends/arm/test/runner_utils.py
Lines changed: 0 additions & 2 deletions
diff --git a/‎backends/cadence/aot/TARGETS
Lines changed: 20 additions & 0 deletions b/‎backends/cadence/aot/TARGETS
Lines changed: 20 additions & 0 deletions
diff --git a/‎backends/cadence/aot/export_example.py
Lines changed: 8 additions & 6 deletions b/‎backends/cadence/aot/export_example.py
Lines changed: 8 additions & 6 deletions
diff --git a/‎backends/cadence/aot/utils.py
Lines changed: 2 additions & 1 deletion b/‎backends/cadence/aot/utils.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/cadence/runtime/TARGETS
Lines changed: 2 additions & 0 deletions b/‎backends/cadence/runtime/TARGETS
Lines changed: 2 additions & 0 deletions
@@ -20,16 +20,16 @@
 CUSTOM_RUNNERS = {
     "linux": {
         # These models run OOM on a smaller runner; the root cause is unclear (T163016365)
-        "w2l": "linux.12xlarge",
-        "ic4": "linux.12xlarge",
-        "resnet50": "linux.12xlarge",
-        "llava": "linux.12xlarge",
-        "llama3_2_vision_encoder": "linux.12xlarge",
-        # "llama3_2_text_decoder": "linux.12xlarge",  # TODO: re-enable test when Huy's change is in / model gets smaller.
+        "w2l": "linux.4xlarge.memory",
+        "ic4": "linux.4xlarge.memory",
+        "resnet50": "linux.4xlarge.memory",
+        "llava": "linux.4xlarge.memory",
+        "llama3_2_vision_encoder": "linux.4xlarge.memory",
+        "llama3_2_text_decoder": "linux.4xlarge.memory",
         # These models time out on a smaller runner; the root cause is unclear (T161064121)
-        "dl3": "linux.12xlarge",
-        "emformer_join": "linux.12xlarge",
-        "emformer_predict": "linux.12xlarge",
+        "dl3": "linux.4xlarge.memory",
+        "emformer_join": "linux.4xlarge.memory",
+        "emformer_predict": "linux.4xlarge.memory",
     }
 }
 
@@ -39,10 +39,12 @@
     "linux": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
     "macos": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
 }
 
 
@@ -49,6 +49,9 @@ install_buck() {
 
   rm "${BUCK2}"
   popd
+
+  # Kill all running buck2 daemons for a fresh start
+  buck2 killall || true
 }
 
 function write_sccache_stub() {
 
@@ -42,6 +42,8 @@ jobs:
 
   build-demo-ios:
     name: build-demo-ios
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -190,6 +192,8 @@ jobs:
         ) done
 
   upload-frameworks-ios:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-22.04
     needs: [build-frameworks-ios, set-version]
     timeout-minutes: 30
@@ -278,6 +282,8 @@ jobs:
 
   build-benchmark-app:
     name: build-benchmark-app
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
 
@@ -3,21 +3,7 @@ on:
   pull_request:
     types: [closed]
     branches:
-      - 'gh/cccclai/[0-9]+/base'
-      - 'gh/dbort/[0-9]+/base'
-      - 'gh/dvorjackz/[0-9]+/base'
-      - 'gh/guangy10/[0-9]+/base'
-      - 'gh/helunwencser/[0-9]+/base'
-      - 'gh/jorgep31415/[0-9]+/base'
-      - 'gh/kimishpatel/[0-9]+/base'
-      - 'gh/kirklandsign/[0-9]+/base'
-      - 'gh/larryliu0820/[0-9]+/base'
-      - 'gh/lucylq/[0-9]+/base'
-      - 'gh/manuelcandales/[0-9]+/base'
-      - 'gh/mcr229/[0-9]+/base'
-      - 'gh/swolchok/[0-9]+/base'
-      - 'gh/SS-JIA/[0-9]+/base'
-      - 'gh/trivedivivek/[0-9]+/base'
+      - 'gh/*/[0-9]+/base'
 
 jobs:
   ghstack_merge_to_main:
 
@@ -332,7 +332,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12
 
   unittest-arm:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
 
@@ -131,7 +131,7 @@ jobs:
 
   test-arm-backend-delegation:
     name: test-arm-backend-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -158,7 +158,7 @@ jobs:
 
   test-arm-reference-delegation:
     name: test-arm-reference-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -352,6 +352,8 @@ jobs:
         done
 
   test-huggingface-transformers:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     name: test-huggingface-transformers
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     secrets: inherit
 
@@ -682,6 +682,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
 endif()
 
 if(EXECUTORCH_BUILD_PYBIND)
+  # Setup RPATH.
+  # See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
+  if(APPLE)
+    set(CMAKE_MACOSX_RPATH ON)
+    set(_rpath_portable_origin "@loader_path")
+  else()
+    set(_rpath_portable_origin $ORIGIN)
+  endif(APPLE)
+  # Use separate rpaths during build and install phases
+  set(CMAKE_SKIP_BUILD_RPATH  FALSE)
+  # Don't use the install-rpath during the build phase
+  set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
+  set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}")
+  # Automatically append to the install rpath every directory of a linked
+  # library that lies outside the project (applied to each target).
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)
 
   if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
@@ -765,46 +781,6 @@ if(EXECUTORCH_BUILD_PYBIND)
   target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
   target_link_libraries(portable_lib PRIVATE ${_dep_libs})
-  if(APPLE)
-    # pip wheels will need to be able to find the torch libraries. On Linux, the
-    # .so has non-absolute dependencies on libs like "libtorch.so" without
-    # paths; as long as we `import torch` first, those dependencies will work.
-    # But Apple dylibs do not support non-absolute dependencies, so we need to
-    # tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries
-    # for the torch libraries will look like "@rpath/libtorch.dylib", so we can
-    # add an LC_RPATH entry to look in a directory relative to the installed
-    # location of our _portable_lib.so file. To see these LC_* values, run
-    # `otool -l _portable_lib*.so`.
-    set_target_properties(
-      portable_lib
-      PROPERTIES # Assume that this library will be installed in
-                 # `site-packages/executorch/extension/pybindings`, and that
-                 # the torch libs are in `site-packages/torch/lib`.
-                 BUILD_RPATH "@loader_path/../../../torch/lib"
-                 INSTALL_RPATH "@loader_path/../../../torch/lib"
-                 # Assume <executorch> is the root `site-packages/executorch`
-                 # Need to add <executorch>/extension/llm/custom_ops for
-                 # libcustom_ops_aot_lib.dylib
-                 BUILD_RPATH "@loader_path/../../extension/llm/custom_ops"
-                 INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops"
-                 # Need to add <executorch>/kernels/quantized for
-                 # libquantized_ops_aot_lib.dylib
-                 BUILD_RPATH "@loader_path/../../kernels/quantized"
-                 INSTALL_RPATH "@loader_path/../../kernels/quantized"
-    )
-  else()
-    set_target_properties(
-      portable_lib
-      PROPERTIES
-        # Assume <executorch> is the root `site-packages/executorch`
-        # Need to add <executorch>/extension/llm/custom_ops for
-        # libcustom_ops_aot_lib
-        # Need to add <executorch>/kernels/quantized for
-        # libquantized_ops_aot_lib
-        BUILD_RPATH
-        "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
-    )
-  endif()
 
   install(TARGETS portable_lib
           LIBRARY DESTINATION executorch/extension/pybindings
 
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 }
 
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
             XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
             XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
             XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-            XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
             XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
             XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
         };
 
@@ -5,8 +5,15 @@
 
 # pyre-unsafe
 
+import logging
+
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import is_buffer
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
 
 
 class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             fake_tensor = node.meta["val"]
             if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
-                if node.meta["val"].dtype == torch.int64:
-                    node.meta["val"] = node.meta["val"].to(torch.int32)
-                    buffer_name = (
-                        self.exported_program.graph_signature.inputs_to_buffers[
-                            node.name
-                        ]
-                    )
-                    new_tensor = self.exported_program.state_dict[buffer_name].to(
-                        torch.int32
-                    )
-                    self.exported_program.state_dict[buffer_name] = new_tensor
+                if node.meta["val"].dtype == torch.int64 and is_param_node(
+                    self.exported_program, node
+                ):
+                    if is_buffer(self.exported_program, node):
+                        node.meta["val"] = node.meta["val"].to(torch.int32)
+                        buffer_name = (
+                            self.exported_program.graph_signature.inputs_to_buffers[
+                                node.name
+                            ]
+                        )
+                        buffer = self.exported_program.state_dict[node.name]
+                        logger.warning(
+                            f"Casting buffer {node.name} from torch.int64 to torch.int32"
+                            f" defined in {node.meta['stack_trace']}"
+                        )
+                        if torch.min(buffer) < torch.iinfo(torch.int32).min:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
+                            )
+                        if torch.max(buffer) > torch.iinfo(torch.int32).max:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
+                            )
+                        buffer_int32 = buffer.to(torch.int32)
+                        self.exported_program.state_dict[buffer_name] = buffer_int32
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._to_int32(graph_module)
 
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 if isinstance(arg, Node):
                     new_args.append(arg)
                     continue
+                if isinstance(arg, int) and not torch.is_floating_point(
+                    get_first_fake_tensor(n)
+                ):
+                    new_args.append(arg)
+                    continue
 
                 prefix = "_tensor_constant_"
                 get_new_attr_name = get_new_attr_name_with_prefix(prefix)
 
@@ -75,6 +75,12 @@ def forward(self, x):
             x = 1.0 + x
             return x
 
+    class ShiftInplaceSub(torch.nn.Module):
+        def forward(self, x):
+            x = x >> 4
+            x -= 10
+            return x
+
     # Inplace ops end with '_' (from aten naming)
     ops = [
         ("Add", Add()),
@@ -160,3 +166,6 @@ def test_MI_const(self, test_name: str, op: torch.nn.Module, x):
     @parameterized.expand(tensor_scalar_tests)
     def test_BI(self, test_name: str, op: torch.nn.Module, x, y):
         self._test_add_tosa_BI_pipeline(op, (x, y))
+
+    def test_shift_sub_inplace_tosa_MI(self):
+        self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),))
@@ -266,8 +266,6 @@ def run_corstone(
                 "-C",
                 "mps3_board.uart0.out_file='-'",
                 "-C",
-                "cpu0.CFGITCMSZ=11",
-                "-C",
                 "cpu0.semihosting-enable=1",
                 "-C",
                 "cpu0.semihosting-stack_base=0",
 
@@ -50,6 +50,26 @@ python_library(
     ],
 )
 
+python_library(
+    name = "export_example",
+    srcs = [
+        "export_example.py",
+    ],
+    deps = [
+        ":passes",
+        ":utils",
+        ":ops_registrations",
+        ":replace_ops",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot/quantizer:fusion_pass",
+        "//executorch/backends/cadence/runtime:runtime",
+        "//executorch/backends/cadence/aot/quantizer:quantizer",
+        "//executorch/backends/transforms:decompose_sdpa",
+        "//executorch/backends/transforms:remove_clone_ops",
+        "//executorch/exir:lib",
+        "//executorch/devtools:lib",
+    ],
+)
 
 python_library(
     name = "pass_utils",
 
@@ -60,6 +60,7 @@ def export_model(
     model: nn.Module,
     example_inputs: Tuple[Any, ...],
     file_name: str = "CadenceDemoModel",
+    run_and_compare: bool = True,
 ):
     # create work directory for outputs and model binary
     working_dir = tempfile.mkdtemp(dir="/tmp")
@@ -112,9 +113,10 @@ def export_model(
     )
 
     # TODO: move to test infra
-    runtime.run_and_compare(
-        executorch_prog=exec_prog,
-        inputs=example_inputs,
-        ref_outputs=ref_outputs,
-        working_dir=working_dir,
-    )
+    if run_and_compare:
+        runtime.run_and_compare(
+            executorch_prog=exec_prog,
+            inputs=example_inputs,
+            ref_outputs=ref_outputs,
+            working_dir=working_dir,
+        )
@@ -162,7 +162,8 @@ def print_ops_info(
 
     # Print the final ops and their counts in a tabular format
     logging.info(
-        tabulate(
+        "\n"
+        + tabulate(
             sorted_ops_count,
             headers=[
                 "Final Operators                                    ",  # one character longer than the longest op name
 
@@ -7,6 +7,8 @@ python_library(
     srcs = [
         "__init__.py",
         "executor.py",
+        "runtime.py",
+        "utils.py"
     ] + glob([
         "xtsc-cfg/**/*",
     ]),
Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,9 @@ install_buck() {`
`49`	`49`
`50`	`50`	`rm "${BUCK2}"`
`51`	`51`	`popd`
	`52`	`+`
	`53`	`+ # Kill all running buck2 daemons for a fresh start`
	`54`	`+ buck2 killall \|\| true`
`52`	`55`	`}`
`53`	`56`
`54`	`57`	`function write_sccache_stub() {`
Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {`
`151`	`151`	`XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);`
`152`	`152`	`XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);`
`153`	`153`	`XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);`
`154`		`- XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);`
`155`	`154`	`XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);`
`156`	`155`	`XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);`
`157`	`156`	`}`