
Commit 07838a4

Merge branch 'main' into jz/re-land-ao-api
2 parents: f4795df + 12f4431


60 files changed: +793 −269 lines

.ci/scripts/gather_test_models.py

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@
         "dl3": "linux.4xlarge.memory",
         "emformer_join": "linux.4xlarge.memory",
         "emformer_predict": "linux.4xlarge.memory",
-        "phi-4-mini": "linux.4xlarge.memory",
+        "phi_4_mini": "linux.4xlarge.memory",
     }
 }

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion

@@ -78,7 +78,6 @@ ${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
     -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
     --group_size ${QLINEAR_GROUP_SIZE} \
     -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-    --disable_dynamic_shape \
     -d fp32
 
 # Test run

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions

@@ -100,11 +100,11 @@ test_model() {
         rm "./${MODEL_NAME}.pte"
         return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
     fi
-    if [[ "${MODEL_NAME}" == "phi-4-mini" ]]; then
+    if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
         # Install requirements for export_llama
         bash examples/models/llama/install_requirements.sh
         # Test export_llama script: python3 -m examples.models.llama.export_llama.
-        "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
+        "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi_4_mini/config.json
         run_portable_executor_runner
         rm "./${MODEL_NAME}.pte"
         return

.ci/scripts/unittest-macos-buck2.sh

File mode changed: 100644 → 100755

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ jobs:
           - model: emformer_join
             backend: xnnpack-quantization-delegation
             runner: linux.4xlarge.memory
-          - model: phi-4-mini
+          - model: phi_4_mini
             backend: portable
             runner: linux.4xlarge.memory
           - model: llama3_2_vision_encoder

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion

@@ -72,7 +72,7 @@ jobs:
             backend: portable
           - model: softmax
             backend: portable
-          - model: phi-4-mini
+          - model: phi_4_mini
             backend: portable
           - model: qwen2_5
             backend: portable

backends/apple/coreml/CMakeLists.txt

Lines changed: 39 additions & 14 deletions

@@ -1,4 +1,9 @@
 # Copyright © 2023 Apple Inc. All rights reserved.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 cmake_minimum_required(VERSION 3.19)
 

@@ -111,32 +116,48 @@ set(PROTOBUF_SOURCES
   runtime/sdk/format/WordTagger.pb.cc
 )
 
+find_library(FOUNDATION_FRAMEWORK Foundation)
+
+# CoreML util
+add_library(coreml_util ${UTIL_SOURCES})
+target_include_directories(coreml_util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util)
+target_link_libraries(coreml_util PRIVATE ${FOUNDATION_FRAMEWORK})
+
+install(
+  TARGETS coreml_util
+  DESTINATION lib
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
+
+# CoreML inmemoryfs
+add_library(coreml_inmemoryfs ${INMEMORYFS_SOURCES})
+target_include_directories(coreml_inmemoryfs PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/runtime/inmemoryfs)
+target_link_libraries(coreml_inmemoryfs PRIVATE coreml_util ${FOUNDATION_FRAMEWORK})
+
+install(
+  TARGETS coreml_inmemoryfs
+  DESTINATION lib
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
+
 # Define the delegate library
 add_library(coremldelegate)
-target_sources(
-  coremldelegate PRIVATE ${INMEMORYFS_SOURCES} ${KVSTORE_SOURCES}
-                         ${DELEGATE_SOURCES} ${UTIL_SOURCES}
-)
+target_sources(coremldelegate PRIVATE ${KVSTORE_SOURCES} ${DELEGATE_SOURCES})
 
 target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include
 )
 target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kvstore
 )
-target_include_directories(
-  coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/inmemoryfs
-)
 target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/delegate
 )
-target_include_directories(
-  coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
-)
 target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
 target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
-target_link_libraries(coremldelegate PRIVATE executorch_core)
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
   target_sources(coremldelegate PRIVATE ${SDK_SOURCES} ${PROTOBUF_SOURCES})

@@ -156,13 +177,17 @@ endif()
 
 find_library(ACCELERATE_FRAMEWORK Accelerate)
 find_library(COREML_FRAMEWORK CoreML)
-find_library(FOUNDATION_FRAMEWORK Foundation)
 find_library(SQLITE_LIBRARY sqlite3)
 
 target_link_libraries(
   coremldelegate
-  PRIVATE executorch_core ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK}
-          ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY}
+  PUBLIC coreml_util
+         coreml_inmemoryfs
+  PRIVATE executorch_core
+          ${ACCELERATE_FRAMEWORK}
+          ${COREML_FRAMEWORK}
+          ${FOUNDATION_FRAMEWORK}
+          ${SQLITE_LIBRARY}
 )
 
 target_link_options_shared_lib(coremldelegate)

backends/arm/_passes/arm_pass_manager.py

Lines changed: 13 additions & 7 deletions

@@ -55,7 +55,10 @@
     RetraceFoldedDtypesPass,
 )
 from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass
-from executorch.backends.arm._passes.fuse_constant_ops_pass import FuseConstantOpsPass
+from executorch.backends.arm._passes.fuse_constant_ops_pass import (
+    ComputeConstantOpsAOT,
+    FuseConstantArgsPass,
+)
 from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( # type: ignore[import-not-found]
     FuseQuantizedActivationPass,
 )

@@ -121,21 +124,23 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
+        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
+        self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(CastInt64ToInt32Pass(exported_program))
-        self.add_pass(MatchArgRanksPass(exported_program))
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(FuseViewCopyTransform())
-        self.add_pass(FuseConstantOpsPass(exported_program))
+        self.add_pass(FuseConstantArgsPass(exported_program))
+
         self.add_pass(InsertTableOpsPass(exported_program))
         self.add_pass(AnnotateChannelsLastDimOrder())
         self.add_pass(InsertRescalePass())

@@ -166,21 +171,22 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
+        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
+        self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(CastInt64ToInt32Pass(exported_program))
-        self.add_pass(MatchArgRanksPass(exported_program))
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(FuseViewCopyTransform())
-        self.add_pass(FuseConstantOpsPass(exported_program))
+        self.add_pass(FuseConstantArgsPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
         self.add_pass(AnnotateChannelsLastDimOrder())
         self.add_pass(InsertRescalePass())

backends/arm/_passes/cast_int64_pass.py

Lines changed: 28 additions & 27 deletions

@@ -8,7 +8,6 @@
 import logging
 
 import torch
-from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch._export.utils import is_buffer
 

@@ -25,35 +24,37 @@ def __init__(self, exported_program: torch.export.ExportedProgram):
         super(CastInt64ToInt32Pass, self).__init__()
         self.exported_program = exported_program
 
+    def _assert_within_int32(self, tensor: torch.Tensor, node: torch.fx.Node):
+        if torch.min(tensor) < torch.iinfo(torch.int32).min:
+            raise RuntimeError(
+                f"Node {node.name} has value < {torch.iinfo(torch.int32).min}"
+            )
+        if torch.max(tensor) > torch.iinfo(torch.int32).max:
+            raise RuntimeError(
+                f"Node {node.name} has value > {torch.iinfo(torch.int32).max}"
+            )
+
     def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             fake_tensor = node.meta["val"]
-            if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
-                if node.meta["val"].dtype == torch.int64 and is_param_node(
-                    self.exported_program, node
-                ):
-                    if is_buffer(self.exported_program, node):
-                        node.meta["val"] = node.meta["val"].to(torch.int32)
-                        buffer_name = (
-                            self.exported_program.graph_signature.inputs_to_buffers[
-                                node.name
-                            ]
-                        )
-                        buffer = self.exported_program.state_dict[node.name]
-                        logger.warning(
-                            f"Casting buffer {node.name} from torch.int64 to torch.int32"
-                            f" defined in {node.meta['stack_trace']}"
-                        )
-                        if torch.min(buffer) < torch.iinfo(torch.int32).min:
-                            raise RuntimeError(
-                                f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
-                            )
-                        if torch.max(buffer) > torch.iinfo(torch.int32).max:
-                            raise RuntimeError(
-                                f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
-                            )
-                        buffer_int32 = buffer.to(torch.int32)
-                        self.exported_program.state_dict[buffer_name] = buffer_int32
+            if not isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
+                continue
+            if fake_tensor.dtype != torch.int64:
+                continue
+            if is_buffer(self.exported_program, node):
+                node.meta["val"] = fake_tensor.to(torch.int32)
+                buffer_name = self.exported_program.graph_signature.inputs_to_buffers[
+                    node.name
+                ]
+                buffer = self.exported_program.state_dict[node.name]
+                self._assert_within_int32(buffer, node)
+                logger.warning(
+                    f"Casting buffer {node.name} from torch.int64 to torch.int32"
+                    f" defined in {node.meta.get('stack_trace','[no stack trace found]')}"
+                )
+                buffer_int32 = buffer.to(torch.int32)
+                self.exported_program.state_dict[buffer_name] = buffer_int32
+                continue
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._to_int32(graph_module)
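The rewrite above flattens the nested buffer-handling logic into early `continue` guards and factors the int64 range check into a `_assert_within_int32` helper. As a rough standalone sketch of that check, using only plain PyTorch (the free function and example buffer below are hypothetical, not part of the pass):

import torch

def assert_within_int32(tensor: torch.Tensor, name: str) -> None:
    # Refuse to downcast if any element would overflow int32.
    info = torch.iinfo(torch.int32)
    if torch.min(tensor) < info.min:
        raise RuntimeError(f"Node {name} has value < {info.min}")
    if torch.max(tensor) > info.max:
        raise RuntimeError(f"Node {name} has value > {info.max}")

# Hypothetical usage: validate an int64 buffer before casting it to int32.
buffer = torch.tensor([0, 7, 2**31 - 1], dtype=torch.int64)
assert_within_int32(buffer, "example_buffer")
buffer_int32 = buffer.to(torch.int32)  # safe only after the check passes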

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 2 additions & 15 deletions

@@ -174,11 +174,8 @@ def call(self, graph_module: GraphModule) -> PassResult:
 
 class QuantizeOperatorArguments(ExportPass):
     """
-    This pass makes sure that the arguments to full.default and clamp.default are quantized correctly.
+    This pass makes sure that the arguments to clamp.default are quantized correctly.
     More specifically, this pass:
-    - Makes sure the fill_value for full.default is quantized. This pass needs to be run before
-      the folding pass above to make sure that the retraced output of the full.default op is
-      the right dtype.
     - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator.
     """
 

@@ -189,7 +186,6 @@ def call(self, graph_module: GraphModule) -> PassResult:
             n = cast(Node, n)
             if n.target not in {
                 exir_ops.edge.aten.clamp.default,
-                exir_ops.edge.aten.full.default,
             }:
                 continue
 

@@ -200,16 +196,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
 
             qargs = QuantArgs.from_operator(user.target, user.args)
 
-            if n.target == exir_ops.edge.aten.full.default:
-                if "dtype" not in n.kwargs.keys() or n.kwargs["dtype"] != qargs.dtype:
-                    # replace the node arg with a quantized dito and also set dtype
-                    # to get the right output according to the Edge IR specification:
-                    # exir/dialects/edge/edge.yaml:3596
-                    quantized_full_value = qargs.quantize_value(n.args[1]).item()
-                    n.update_arg(1, quantized_full_value)
-                    n.update_kwarg("dtype", qargs.dtype)
-                    modified = True
-            elif n.target == exir_ops.edge.aten.clamp.default:
+            if n.target == exir_ops.edge.aten.clamp.default:
                 # Quantize the min and max arguments of clamp, if they are not None
                 min_val = n.args[1]
                 max_val = None if len(n.args) <= 2 else n.args[2]