
Commit b67b193

JacobSzwejbka authored and facebook-github-bot committed
Add pass to extract mutable weights into a .ptd (#7798)
Summary: Cleaned up the existing pass and fixed a typing error (the pass now returns a PassResult rather than an ExportedProgram). Added another option to the backend config to extract only mutable weights, which training workflows will use. Fixed the ordering of the ExecuTorch passes and added a warning not to add passes after memory planning (this particular pass was actually fine, but in general we like the invariant that memory planning runs last). Fixed the emitter to prioritize external placement over mutable placement. Down the line we will need to support intermixing mutable and non-mutable weights in the same .ptd (the stakes are memory regressions, not correctness), but no one needs that today, so it is deferred.

Reviewed By: lucylq

Differential Revision: D68121580
1 parent 948fba6 commit b67b193
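
For orientation, the workflow this commit enables looks roughly like the following, condensed from the test added in exir/emit/test/test_emit.py below (the toy network and shapes are illustrative; _export_forward_backward expects the loss as the model's first output):

import torch
from torch import nn
from torch.export import export
from torch.export.experimental import _export_forward_backward
from executorch.exir import ExecutorchBackendConfig, to_edge

class TrainingNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 2)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, label):
        pred = self.linear(x)
        # The loss must be the first output for on-device training.
        return self.loss(pred, label), pred.detach().argmax(dim=1)

ep = export(
    TrainingNet(), (torch.randn(1, 2), torch.ones(1, dtype=torch.int64)), strict=True
)
ep = _export_forward_backward(ep)  # joint forward/backward graph
et = to_edge(ep).to_executorch(
    config=ExecutorchBackendConfig(external_mutable_weights=True)
)
# The trainable weight and bias now live in the external constant buffer
# (destined for the .ptd file) instead of the PTE's constant buffer.
print(len(et._emitter_output.external_constant_buffer))  # 2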

File tree

5 files changed, +146 −37 lines changed


exir/capture/_config.py

Lines changed: 4 additions & 0 deletions
@@ -92,3 +92,7 @@ class ExecutorchBackendConfig:
     # If set to true, all constant tensors will be stored in a separate file,
     # external to the PTE file.
     external_constants: bool = False
+
+    # If set to true, all trainable weights will be stored in a separate file,
+    # external to the PTE file.
+    external_mutable_weights: bool = False

exir/emit/_emitter.py

Lines changed: 27 additions & 29 deletions
@@ -387,38 +387,36 @@ def _save_new_const_tensor(
         # Update buffer_idx to point to the end of the list where we are adding the new buffer.
         buffer = Buffer(storage=buffer_data)

-        # Tensor is mutable with initial state.
-        if allocation_info:
+        # Tensor is stored outside of the PTE file.
+        if (
+            spec.extra_tensor_info is not None
+            and spec.extra_tensor_info.fully_qualified_name is not None
+            and spec.extra_tensor_info.location == TensorDataLocation.EXTERNAL
+        ):
+            assert (
+                constant_tag is not None
+            ), "Constant tag is not set for external tensor"
+            # TODO (#7633): Handle the case where we have both mutable and non-mutable weights that we want to put in the same external file.
+            # We will need to create 2 segments in that case, but it will be a while until we see this case. LLM finetuning will probably require this.
+
+            buffer_idx = len(self.program_state.external_constant_buffer)
+            self.program_state.external_constant_hash[hashed] = buffer_idx
+            self.program_state.external_constant_buffer.append(buffer_data)
+            if constant_tag not in self.program_state.external_constant_map:
+                self.program_state.external_constant_map[constant_tag] = {}
+            self.program_state.external_constant_map[constant_tag][
+                spec.extra_tensor_info.fully_qualified_name  # pyre-ignore Undefined attribute [16]: `Optional` has no attribute `fully_qualified_name`.
+            ] = buffer_idx
+        # Tensor is mutable with initial state. Place it into the mutable segment.
+        elif allocation_info:
             buffer_idx = len(self.program_state.mutable_buffer)
             self.program_state.cached_spec_mutable_hash_values[hashed] = buffer_idx
             self.program_state.mutable_buffer.append(buffer)
-
-        # Tensor is constant.
+        # Tensor is stored in the PTE file.
         else:
-            # Tensor is stored outside of the PTE file.
-            if (
-                spec.extra_tensor_info is not None
-                and spec.extra_tensor_info.fully_qualified_name is not None
-                and spec.extra_tensor_info.location == TensorDataLocation.EXTERNAL
-            ):
-                assert (
-                    constant_tag is not None
-                ), "Constant tag is not set for external tensor"
-
-                buffer_idx = len(self.program_state.external_constant_buffer)
-                self.program_state.external_constant_hash[hashed] = buffer_idx
-                self.program_state.external_constant_buffer.append(buffer_data)
-                if constant_tag not in self.program_state.external_constant_map:
-                    self.program_state.external_constant_map[constant_tag] = {}
-                self.program_state.external_constant_map[constant_tag][
-                    spec.extra_tensor_info.fully_qualified_name  # pyre-ignore Undefined attribute [16]: `Optional` has no attribute `fully_qualified_name`.
-                ] = buffer_idx
-
-            # Tensor is stored in the PTE file.
-            else:
-                buffer_idx = len(self.program_state.constant_buffer)
-                self.program_state.cached_spec_hash_values[hashed] = buffer_idx
-                self.program_state.constant_buffer.append(buffer)
+            buffer_idx = len(self.program_state.constant_buffer)
+            self.program_state.cached_spec_hash_values[hashed] = buffer_idx
+            self.program_state.constant_buffer.append(buffer)

         return buffer_idx

@@ -458,7 +456,7 @@ def _tensor_spec_to_evalue(

         hashed = hashlib.sha256(buffer_data).hexdigest()

-        if allocation_info:
+        if allocation_info and spec.extra_tensor_info is None:
             buffer_idx = self.program_state.cached_spec_mutable_hash_values.get(
                 hashed, -1
             )
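
Net effect: _save_new_const_tensor now checks for external placement before the mutable-with-initial-state path, so a tensor that is both mutable and tagged EXTERNAL lands in the external file. A tiny runnable sketch of just that priority order (the helper below is illustrative, not the emitter's API):

def choose_placement(is_external: bool, has_allocation_info: bool) -> str:
    # External tensors win first: they go to the external (.ptd) file even if mutable.
    if is_external:
        return "external"
    # Mutable tensors with an initial state go to the mutable segment.
    if has_allocation_info:
        return "mutable"
    # Everything else stays in the PTE constant buffer.
    return "constant"

assert choose_placement(True, True) == "external"  # external now beats mutable
assert choose_placement(False, True) == "mutable"
assert choose_placement(False, False) == "constant"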

exir/emit/test/test_emit.py

Lines changed: 51 additions & 0 deletions
@@ -67,6 +67,7 @@
 from torch import nn

 from torch.export import Dim, export, export_for_training
+from torch.export.experimental import _export_forward_backward


 class WrapperModule(torch.nn.Module):

@@ -1733,3 +1734,53 @@ def forward(self, x):
         self.assertEqual(
             len(edge_program_manager.executorch_program.backend_delegate_data), 1
         )
+
+    def test_constant_tagged_mutable_tensors(self) -> None:
+        class Net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(2, 2)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # On-device training requires the loss to be embedded in the model (and be the first output).
+        # We wrap the original model here and add the loss calculation. This will be the model we export.
+        class TrainingNet(nn.Module):
+            def __init__(self, net):
+                super().__init__()
+                self.net = net
+                self.loss = nn.CrossEntropyLoss()
+
+            def forward(self, input, label):
+                pred = self.net(input)
+                return self.loss(pred, label), pred.detach().argmax(dim=1)
+
+        net = TrainingNet(Net())
+
+        # Captures the forward graph. The graph will look similar to the model definition now.
+        # Will move to export_for_training soon, which is the API planned to be supported in the long term.
+        ep = export(
+            net, (torch.randn(1, 2), torch.ones(1, dtype=torch.int64)), strict=True
+        )
+        # Captures the backward graph. The exported_program now contains the joint forward and backward graph.
+        ep = _export_forward_backward(ep)
+        # Lower the graph to edge dialect.
+        ep = to_edge(ep)
+        # Lower the graph to executorch.
+        ep = ep.to_executorch(
+            config=ExecutorchBackendConfig(external_mutable_weights=True)
+        )
+
+        emitter_output = ep._emitter_output
+        # Check that constant_buffer is empty besides the non-constant placeholder 0.
+        self.assertEqual(len(emitter_output.program.constant_buffer), 1)
+        # Check that constant weights are in the external constant buffer.
+        self.assertEqual(len(emitter_output.external_constant_buffer), 2)
+        # Setting external_mutable_weights=True saves all constants with an associated gradient
+        # under the key '_default_external_constant'.
+        external_map = emitter_output.external_constant_map[
+            "_default_external_constant"
+        ]
+        self.assertEqual(external_map["net.linear.weight"], 0)
+        self.assertEqual(external_map["net.linear.bias"], 1)
exir/passes/external_constants_pass.py

Lines changed: 50 additions & 5 deletions
@@ -7,17 +7,20 @@
 # pyre-strict

 import torch
+from executorch.exir.pass_base import PassResult
 from executorch.exir.tensor import TensorSpec
-from torch.export.exported_program import ExportedProgram
+from torch.export.exported_program import ExportedProgram, OutputKind
+from torch.fx import GraphModule


 def external_constants_pass(
-    ep: ExportedProgram,
-) -> ExportedProgram:
+    gm: GraphModule,
+) -> PassResult:
     """
     Move all constants to external file.
     """
-    for module in ep.graph_module.modules():
+    mutated = False
+    for module in gm.modules():
         if not isinstance(module, torch.fx.GraphModule):
             continue

@@ -26,4 +29,46 @@ def external_constants_pass(
                 spec = node.meta.get("spec")
                 if isinstance(spec, TensorSpec) and spec.const:
                     node.meta["constant_tag"] = "_default_external_constant"
-    return ep
+                    mutated = True
+    return PassResult(gm, mutated)
+
+
+def _is_mutable_weight(node: torch.fx.Node, ep: ExportedProgram) -> bool:
+    grad_targets = [
+        spec.target
+        for spec in ep.graph_signature.output_specs
+        if spec.kind == OutputKind.GRADIENT_TO_PARAMETER
+    ]
+    return (
+        node.op == "placeholder"
+        and node.target in ep.graph_signature.inputs_to_parameters.keys()
+        and ep.graph_signature.inputs_to_parameters[node.target] in grad_targets
+    )
+
+
+def external_mutable_weights_pass(
+    gm: GraphModule,
+    ep: ExportedProgram,
+) -> PassResult:
+    """
+    Move all mutable weights to external file.
+    """
+    # Pass the gm and the ep separately: the gm is being mutated by a number of passes
+    # in to_executorch, so the gm inside the ep is lagging behind, but the ep's graph
+    # signature is still correct.
+    # This is really tech debt, and all of the passes should be refactored to just mutate the ep.
+    mutated = False
+    for module in gm.modules():
+        if not isinstance(module, torch.fx.GraphModule):
+            continue
+
+        for node in module.graph.nodes:
+            if node.op == "placeholder":
+                spec = node.meta.get("spec")
+                if (
+                    isinstance(spec, TensorSpec)
+                    and spec.const
+                    and _is_mutable_weight(node, ep)
+                ):
+                    node.meta["constant_tag"] = "_default_external_constant"
+                    mutated = True
+    return PassResult(gm, mutated)
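
With the new signatures, both passes consume the GraphModule being lowered and report mutation through a standard torch.fx PassResult; external_mutable_weights_pass additionally takes the ExportedProgram for its graph signature. A hedged sketch of calling the constants pass directly (normally to_executorch drives this; outside that flow the TensorSpec metadata on nodes is typically absent, so nothing gets tagged here):

import torch
from torch.export import export
from executorch.exir.passes.external_constants_pass import external_constants_pass

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(2, 2)

    def forward(self, x):
        return self.linear(x)

ep = export(M(), (torch.randn(1, 2),), strict=True)
# The pass now returns a PassResult instead of an ExportedProgram.
result = external_constants_pass(ep.graph_module)
assert result.graph_module is ep.graph_module
print(result.modified)  # False here, since no specs were populated outside to_executorch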

exir/program/_program.py

Lines changed: 14 additions & 3 deletions
@@ -35,7 +35,10 @@
     MemoryFormatOpsPass,
     OpReplacePass,
 )
-from executorch.exir.passes.external_constants_pass import external_constants_pass
+from executorch.exir.passes.external_constants_pass import (
+    external_constants_pass,
+    external_mutable_weights_pass,
+)
 from executorch.exir.passes.insert_write_back_for_buffers_pass import (
     insert_write_back_for_buffers_pass,
 )

@@ -1395,6 +1398,14 @@ def to_executorch(
                # TODO(who?)
                p.update_placeholder_tensor_specs(program, new_gm)

+            # Extract constants if the config says to.
+            if config.external_constants:
+                new_gm_res = external_constants_pass(new_gm)
+                new_gm = new_gm_res.graph_module
+            elif config.external_mutable_weights:
+                new_gm_res = external_mutable_weights_pass(new_gm, program)
+                new_gm = new_gm_res.graph_module
+
            if isinstance(config.memory_planning_pass, dict):
                memory_planning_pass = config.memory_planning_pass.get(
                    name, ExecutorchBackendConfig().memory_planning_pass

@@ -1409,8 +1420,8 @@
            else:
                new_gm_res = memory_planning_pass(new_gm)  # pyre-ignore[29]

-            if config.external_constants:
-                new_gm_res = external_constants_pass(new_gm_res)
+            # WARNING: DO NOT ADD ANY MORE PASSES AFTER THE MEMORY PLANNING PASS.
+            # THERE ARE A LOT OF ASSUMPTIONS IN THE STACK THAT MEMORY PLANNING IS THE LAST PASS BEFORE THE EMITTER.
            assert new_gm_res is not None
            new_gm = new_gm_res.graph_module
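
The added dispatch is a plain if/elif, so at most one extraction pass runs per lowering, with external_constants taking precedence over external_mutable_weights; both now run before memory planning, preserving the invariant that memory planning is the last pass before the emitter. A simplified sketch of just the selection logic (not the real function):

def pick_extraction_pass(external_constants: bool, external_mutable_weights: bool) -> str:
    # Mirrors the if/elif added ahead of memory planning in to_executorch.
    if external_constants:
        return "external_constants_pass"
    elif external_mutable_weights:
        return "external_mutable_weights_pass"
    return "none"

assert pick_extraction_pass(True, True) == "external_constants_pass"
assert pick_extraction_pass(False, True) == "external_mutable_weights_pass"
assert pick_extraction_pass(False, False) == "none"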
