Commit 4b4c012

Authored by wz337 and b-chu
Enable planner to be used for loading sharded optimizer state dict (pytorch#112520)
Cherry-pick of [pytorch#112259](pytorch#112259), requested by MosaicML.

Comment from users:

> Without this, we can't do training resumption, because the model gets loaded without the optimizer.

This creates a more consistent interface for saving and loading sharded state dicts. A planner can be specified when saving a sharded optimizer state dict, but there was previously no planner support for loading one. This change does not affect the default behavior of the function.

Co-authored-by: Brian <[email protected]>
1 parent 47ac502 commit 4b4c012

File tree

1 file changed: +3 −1 lines

torch/distributed/checkpoint/optimizer.py

Lines changed: 3 additions & 1 deletion

@@ -33,6 +33,7 @@
     DefaultLoadPlanner,
 )
 from torch.distributed._shard.api import _shard_tensor
+from torch.distributed.checkpoint.planner import LoadPlanner

 from torch.distributed.checkpoint._nested_dict import unflatten_state_dict
 from torch.distributed.checkpoint.utils import (
@@ -212,6 +213,7 @@ def load_sharded_optimizer_state_dict(
     model_state_dict: STATE_DICT_TYPE,
     optimizer_key: str,
     storage_reader: dist_cp.StorageReader,
+    planner: Optional[LoadPlanner] = None,
 ) -> STATE_DICT_TYPE:
     """
     Loads a state_dict in conjunction with FSDP sharded optimizer state.
@@ -337,7 +339,7 @@ def load_sharded_optimizer_state_dict(
         state_dict=state_dict,
         storage_reader=storage_reader,
         # FIXME the type of planner is wrong in load_state_dict
-        planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else None,
+        planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else planner,
     )

     state_dict = unflatten_state_dict(state_dict, metadata.planner_data)
