Skip to content

Commit 916e62c

Browse files
kiukchung and facebook-github-bot
authored and committed
Changes to kd_monarch to reflect default port changes to remote-process-allocator and monarch CLI component
Summary: Updates `kd_monarch` with the capability to dynamically query for information needed to create and use the `MastAllocator` in `controller.py`. Specifically, it makes it possible to get these fields from the provided `ALLOCATOR_JOB_NAME`:

1. `(hosts, gpus)` ProcMesh dimensions
2. RemoteProcessAllocator `port` (no need to hard-code it or have the user provide it)
3. Task group name

Also changes the remote-process-allocator (aka `hyperactor mesh-worker`) entrypoint's binary name in `kd_monarch_pkg` to `monarch_bootstrap`, to be consistent with its equivalent in OSS (hence making the TorchX AppDef portable).

Next:
~~[6/n] Have kd_monarch use the default component (the custom mast.py is no longer needed). Update the README with updated instructions.~~
[7/n] Remove the Rust CLI in favor of all-Python (we delegate to torchx for most things anyway)
[8/n] Add an E2E unit test using the local_cwd scheduler (actually run a mini-trainer actor)
[9/n] Write an OSS hyperactor mesh-worker entrypoint binary
[10/n] Author a Dockerfile that sets up the environment (much like fbpkgs do for internal runs)
[11/n] Author a TorchXAllocator

Reviewed By: vidhyav

Differential Revision: D75162358

fbshipit-source-id: d2756ffb529b3069baf94cd9dfdde00b8ef2ede4
1 parent a1acccd commit 916e62c

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

tools/mesh_spec.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class MeshSpec:
2828
num_hosts: int
2929
host_type: str
3030
gpus: int
31+
port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
3132

3233

3334
def _tag(mesh_name: str, tag_template: str) -> str:
@@ -47,6 +48,7 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[Me
4748
num_hosts=role.num_replicas,
4849
host_type=appdef.metadata.get(_tag(mesh_name, _TAG_HOST_TYPE), ""),
4950
gpus=int(appdef.metadata.get(_tag(mesh_name, _TAG_GPUS), "-1")),
51+
port=role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT),
5052
)
5153

5254
return None
@@ -82,6 +84,18 @@ class ServerSpec:
8284
state: specs.AppState
8385
meshes: list[MeshSpec]
8486

87+
def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
88+
for mesh_spec in self.meshes:
89+
if mesh_spec.name == mesh_name:
90+
return mesh_spec
91+
92+
raise ValueError(
93+
f"Mesh: '{mesh_name}' not found in job: {self.name}. Try one of: {self.get_mesh_names()}"
94+
)
95+
96+
def get_mesh_names(self) -> list[str]:
97+
return [m.name for m in self.meshes]
98+
8599
def to_json(self) -> dict[str, Any]:
86100
"""Returns the JSON form of this struct that can be printed to console by:
87101

tools/tests/commands_test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from monarch.tools import commands
1313
from monarch.tools.commands import component_args_from_cli
1414

15-
from monarch.tools.config import defaults
15+
from monarch.tools.config import ( # @manual=//monarch/tools/config/meta:defaults
16+
defaults,
17+
)
1618
from monarch.tools.mesh_spec import MeshSpec, ServerSpec
1719
from torchx.specs import AppDef, AppDryRunInfo, AppState, AppStatus, Role
1820

@@ -68,7 +70,14 @@ def test_info(
6870

6971
appdef = AppDef(
7072
name="monarch_test_123",
71-
roles=[Role(name="trainer", image="__unused__", num_replicas=4)],
73+
roles=[
74+
Role(
75+
name="trainer",
76+
image="__unused__",
77+
num_replicas=4,
78+
port_map={"mesh": 26501},
79+
)
80+
],
7281
metadata={
7382
"monarch/meshes/trainer/host_type": "gpu.medium",
7483
"monarch/meshes/trainer/gpus": "2",
@@ -86,6 +95,7 @@ def test_info(
8695
num_hosts=4,
8796
host_type="gpu.medium",
8897
gpus=2,
98+
port=26501,
8999
)
90100
],
91101
),

tools/tests/mesh_spec_test.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
# pyre-strict
78
import json
89
import unittest
910
from dataclasses import asdict
@@ -12,6 +13,7 @@
1213
mesh_spec_from_metadata,
1314
mesh_spec_from_str,
1415
MeshSpec,
16+
ServerSpec,
1517
tag_as_metadata,
1618
)
1719

@@ -87,7 +89,37 @@ def test_mesh_spec_can_dump_as_json(self) -> None:
8789
"name": "trainer",
8890
"num_hosts": 4,
8991
"host_type": "gpu.medium",
90-
"gpus": 2
92+
"gpus": 2,
93+
"port": 26600
9194
}
9295
"""
9396
self.assertEqual(expected.strip("\n"), json.dumps(asdict(mesh_spec), indent=2))
97+
98+
99+
class ServerSpecTest(unittest.TestCase):
100+
def get_test_server_spec(self) -> ServerSpec:
101+
return ServerSpec(
102+
name="monarch-foo-1a2b3c",
103+
state=specs.AppState.RUNNING,
104+
meshes=[
105+
MeshSpec(name="trainer", num_hosts=4, host_type="gpu.medium", gpus=2),
106+
MeshSpec(name="generator", num_hosts=8, host_type="gpu.small", gpus=1),
107+
],
108+
)
109+
110+
def test_get_mesh_spec(self) -> None:
111+
server_spec = self.get_test_server_spec()
112+
mesh_spec = server_spec.get_mesh_spec("trainer")
113+
114+
self.assertEqual("trainer", mesh_spec.name)
115+
self.assertEqual(4, mesh_spec.num_hosts)
116+
self.assertEqual(2, mesh_spec.gpus)
117+
self.assertEqual("gpu.medium", mesh_spec.host_type)
118+
119+
def test_get_mesh_spec_not_found(self) -> None:
120+
server_spec = self.get_test_server_spec()
121+
with self.assertRaisesRegex(
122+
ValueError,
123+
r"Mesh: 'worker' not found in job: monarch-foo-1a2b3c. Try one of: \['trainer', 'generator'\]",
124+
):
125+
server_spec.get_mesh_spec("worker")

0 commit comments

Comments (0)