
Commit 643c381

Fix TrainingModule Parameter Bug
Differential Revision: D69568035
Pull Request resolved: #8443
Parent: 9ba5494
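
In short, as the diffs below show: forward_backward previously let run_method clone its outputs, so the SGD optimizer stepped on copies of the weights instead of the parameter tensors owned by the C++ module, and named_parameters() guarded on self.named_grads instead of self.named_params. The fix runs the training method with clone_outputs=False, clones only the user outputs and the gradients, and corrects the guard. The snippet below is a minimal, self-contained illustration in plain PyTorch (not the ExecuTorch API) of why an in-place update on a clone never reaches the original tensor:

import torch

weight = torch.ones(3)

# "Optimizer step" applied to a clone: the real weight is untouched.
cloned = weight.clone()
cloned.sub_(0.1)
print(torch.equal(weight, torch.ones(3)))  # True: weight unchanged

# Same update applied to the tensor itself (no clone): the real weight moves.
weight.sub_(0.1)
print(torch.equal(weight, torch.ones(3)))  # False: weight updated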

2 files changed: +27 / -16 lines

extension/training/pybindings/_training_module.py

Lines changed: 8 additions & 8 deletions
@@ -41,20 +41,20 @@ def forward_backward(self, method_name: str, inputs: Sequence[Any]) -> List[Any]
             self.parameters_method_prefix + method_name, ()
         )[0]

-        full_outputs = self.model.run_method(method_name, inputs)
+        # Important that the outputs are not cloned because we need the optimizer to
+        # be able to mutate the actual weights and not clones of them.
+        full_outputs = self.model.run_method(method_name, inputs, clone_outputs=False)

         user_outs = full_outputs[:grad_start_idx]
+        user_outs = [x.clone() for x in user_outs]
         grads = full_outputs[grad_start_idx:params_start_idx]
-        params = full_outputs[params_start_idx:]
+        grads = [grad.clone() for grad in grads]

-        # Important that the outputs are not cloned because we need the optimizer to
-        # be able to mutate the actual weights and not clones of them.
-        fqn = self.model.run_method(
-            self.fqn_method_prefix + method_name, (), clone_outputs=False
-        )
+        fqn = self.model.run_method(self.fqn_method_prefix + method_name, ())

         self.named_grads = dict(zip(fqn, grads))
         if self.named_params is None:
+            params = full_outputs[params_start_idx:]
             self.named_params = dict(zip(fqn, params))

         return user_outs
@@ -65,7 +65,7 @@ def named_gradients(self) -> Dict[str, Tensor]:
         return self.named_grads

     def named_parameters(self) -> Dict[str, Tensor]:
-        if self.named_grads is None:
+        if self.named_params is None:
             raise RuntimeError(
                 "Must call forward_backward before named_params. This will be fixed in a later version"
             )
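
For readability, here is how forward_backward and the start of named_parameters read after this patch, stitched together from the context and added lines above. The lines computing grad_start_idx sit above the hunk and are reconstructed as an assumption (mirroring how params_start_idx is obtained), so treat this as a sketch of the patched file, not a verbatim copy:

    def forward_backward(self, method_name: str, inputs: Sequence[Any]) -> List[Any]:
        # Assumption (above the hunk, not part of this diff): the split points of
        # the flat output list come from metadata methods emitted alongside the
        # training graph.
        grad_start_idx = self.model.run_method(
            self.gradients_method_prefix + method_name, ()
        )[0]
        params_start_idx = self.model.run_method(
            self.parameters_method_prefix + method_name, ()
        )[0]

        # Important that the outputs are not cloned because we need the optimizer to
        # be able to mutate the actual weights and not clones of them.
        full_outputs = self.model.run_method(method_name, inputs, clone_outputs=False)

        # Clone what the user sees and what the optimizer reads; the parameter
        # tensors themselves stay aliased to the real weights.
        user_outs = full_outputs[:grad_start_idx]
        user_outs = [x.clone() for x in user_outs]
        grads = full_outputs[grad_start_idx:params_start_idx]
        grads = [grad.clone() for grad in grads]

        fqn = self.model.run_method(self.fqn_method_prefix + method_name, ())

        self.named_grads = dict(zip(fqn, grads))
        if self.named_params is None:
            params = full_outputs[params_start_idx:]
            self.named_params = dict(zip(fqn, params))

        return user_outs

    def named_parameters(self) -> Dict[str, Tensor]:
        if self.named_params is None:
            raise RuntimeError(
                "Must call forward_backward before named_params. This will be fixed in a later version"
            )
        # ... (remainder unchanged and outside the hunks above)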

extension/training/pybindings/test/test.py

Lines changed: 19 additions & 8 deletions
@@ -28,20 +28,19 @@ def __init__(self):
         def forward(self, x, y):
             return self.loss(self.linear(x).softmax(dim=0), y)

-        def get_random_inputs(self):
-            return (torch.randn(3), torch.tensor([1.0, 0.0, 0.0]))
+        def get_inputs(self):
+            return (torch.ones(3, dtype=torch.float32), torch.tensor([1.0, 0.0, 0.0]))

     def test(self):
         m = self.ModuleSimpleTrain()
-        ep = torch.export.export(m, m.get_random_inputs(), strict=True)
+        ep = torch.export.export(m, m.get_inputs(), strict=True)
         ep = _export_forward_backward(ep)
         ep = to_edge(ep)
         ep = ep.to_executorch()
         buffer = ep.buffer
         tm = _load_for_executorch_for_training_from_buffer(buffer)

-        tm.forward_backward("forward", m.get_random_inputs())
-        orig_param = list(tm.named_parameters().values())[0].clone()
+        orig_loss = tm.forward_backward("forward", m.get_inputs())
         optimizer = get_sgd_optimizer(
             tm.named_parameters(),
             0.1,
@@ -50,7 +49,19 @@ def test(self):
             0,
             False,
         )
+
+        cloned_params = list(tm.named_parameters().values())
+        cloned_params = [p.clone() for p in cloned_params]
+
         optimizer.step(tm.named_gradients())
-        self.assertFalse(
-            torch.allclose(orig_param, list(tm.named_parameters().values())[0])
-        )
+
+        # The python module caches the param tensors after the first
+        # inference. So this doesn't test if the params are actually
+        # updated in cpp world.
+        for p, cloned_p in zip(tm.named_parameters().values(), cloned_params):
+            self.assertFalse(torch.allclose(p, cloned_p))
+
+        # Test that the params actually changed in cpp by running against
+        # the same inputs again and seeing that the loss is different.
+        second_loss = tm.forward_backward("forward", m.get_inputs())
+        self.assertFalse(torch.allclose(orig_loss[0], second_loss[0]))
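
As a usage note, the flow this test exercises extends naturally to a small training loop. The sketch below reuses only calls that appear in the test above; the imports, the middle get_sgd_optimizer arguments (hidden between the two hunks), and having ModuleSimpleTrain available at module scope are assumptions, so treat it as an illustration rather than additional test code:

# Sketch only: assumes the same imports as test.py (outside the hunks shown)
# and a module-scope ModuleSimpleTrain exposing the get_inputs() helper above.
m = ModuleSimpleTrain()
ep = torch.export.export(m, m.get_inputs(), strict=True)
ep = _export_forward_backward(ep)
ep = to_edge(ep)
ep = ep.to_executorch()
tm = _load_for_executorch_for_training_from_buffer(ep.buffer)

# The first forward_backward also populates named_parameters / named_gradients.
loss = tm.forward_backward("forward", m.get_inputs())
optimizer = get_sgd_optimizer(
    tm.named_parameters(),
    0.1,
    # ... remaining hyperparameters exactly as in the test above ...
    0,
    False,
)

for _ in range(10):
    optimizer.step(tm.named_gradients())
    loss = tm.forward_backward("forward", m.get_inputs())
    print(loss[0])  # the loss should generally trend down as the real weights move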
