create UDF builtins for rng/seed torch functions (#27)

amirafzali · facebook-github-bot · commit d08e538ff9e7 · 2025-05-27T17:37:11.000-07:00
Summary: Pull Request resolved: #27 **Diff Purpose & Changes** 1. Creating the random/RNG remote functions for our new builtins library: torch.manual_seed(seed), torch.initial_seed(), torch.get_rng_state(), torch.set_rng_state(state), torch.cuda.get_rng_state_all(), torch.cuda.set_rng_state_all(states). The following two appear to be the same function, so we can consider removing one or the other from the library: torch.seed(), torch.random.seed(). Reviewed By: vidhyav, colin2328 Differential Revision: D72944566 fbshipit-source-id: 77abd37b764685c9d6d29ecfb860b435f07b0e11
diff --git a/python/monarch/builtins/random.py b/python/monarch/builtins/random.py
@@ -0,0 +1,69 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+from typing import Callable
+
+import torch
+from monarch.common.remote import remote
+
+
+@remote(propagate="inspect")
+def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
+    torch.manual_seed(seed ^ process_idx)
+
+
+# The propagate callable returns the int placeholder 0; the actual value is
+# produced by torch.initial_seed() in the function body.
+@remote(propagate=lambda: 0)
+def initial_seed_remote() -> int:
+    """Return ``torch.initial_seed()``, the initial seed of torch's default RNG."""
+    return torch.initial_seed()
+
+
+# NOTE(review): the propagate placeholder is torch.zeros(1) (one element,
+# default dtype), while torch.get_rng_state() is documented to return a
+# ByteTensor -- confirm propagation does not rely on the placeholder's
+# dtype/shape matching the real state.
+@remote(propagate=lambda: torch.zeros(1))
+def get_rng_state_remote() -> torch.Tensor:
+    """Return the current RNG state from ``torch.get_rng_state()``."""
+    return torch.get_rng_state()
+
+
+@remote(propagate="inspect")
+def set_rng_state_remote(new_state: torch.Tensor) -> None:
+    """Restore torch's RNG state by passing ``new_state`` to ``torch.set_rng_state``.
+
+    Args:
+        new_state: State tensor to install, e.g. one previously obtained from
+            ``get_rng_state_remote``.
+    """
+    torch.set_rng_state(new_state)
+
+
+def _run_no_return(f: Callable) -> None:
+    f()
+    return None
+
+
+# TODO: return result when uint64 is supported from remote function
+# NOTE(review): the propagate callable itself invokes torch.seed (through
+# _run_no_return), so the RNG is also reseeded wherever propagation is
+# evaluated -- confirm this side effect is intended.
+@remote(propagate=lambda: _run_no_return(torch.seed))
+def seed_remote() -> None:
+    """Reseed torch's RNG via ``torch.seed()``.
+
+    ``torch.seed()`` returns the seed it picked; that value is discarded
+    here (see the TODO above this function).
+    """
+    torch.seed()
+
+
+# Same underlying implementation as seed_remote (torch.seed).
+# TODO: return result when uint64 is supported from remote function
+# NOTE(review): the propagate callable itself invokes torch.random.seed
+# (through _run_no_return), so the RNG is also reseeded wherever propagation
+# is evaluated -- confirm this side effect is intended.
+@remote(propagate=lambda: _run_no_return(torch.random.seed))
+def random_seed_remote() -> None:
+    """Reseed torch's RNG via ``torch.random.seed()``.
+
+    The seed value returned by ``torch.random.seed()`` is discarded here
+    (see the TODO above this function).
+    """
+    torch.random.seed()
+
+
+@remote(propagate="inspect")
+def manual_seed_cuda_remote(seed: int) -> None:
+    """Seed the CUDA RNG by calling ``torch.cuda.manual_seed(seed)``.
+
+    Per the torch documentation this affects the current GPU only; see
+    ``manual_seed_all_cuda_remote`` for all GPUs.
+
+    Args:
+        seed: Seed value passed through unchanged.
+    """
+    torch.cuda.manual_seed(seed)
+
+
+@remote(propagate="inspect")
+def manual_seed_all_cuda_remote(seed: int) -> None:
+    """Seed the CUDA RNGs by calling ``torch.cuda.manual_seed_all(seed)``.
+
+    Per the torch documentation this seeds every visible GPU.
+
+    Args:
+        seed: Seed value passed through unchanged.
+    """
+    torch.cuda.manual_seed_all(seed)
+
+
+# NOTE(review): the propagate placeholder is a one-element list holding
+# torch.zeros(1), while torch.cuda.get_rng_state_all() is documented to
+# return one ByteTensor per device -- confirm propagation does not rely on
+# the placeholder's length/dtype/shape matching the real result.
+@remote(propagate=lambda: [torch.zeros(1)])
+def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
+    """Return the CUDA RNG states from ``torch.cuda.get_rng_state_all()``."""
+    return torch.cuda.get_rng_state_all()
+
+
+@remote(propagate="inspect")
+def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
+    """Restore CUDA RNG states via ``torch.cuda.set_rng_state_all(states)``.
+
+    Args:
+        states: State tensors to install, e.g. the list previously returned
+            by ``get_rng_state_all_cuda_remote``.
+    """
+    torch.cuda.set_rng_state_all(states)
diff --git a/python/tests/builtins/test_random.py b/python/tests/builtins/test_random.py
@@ -0,0 +1,194 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+import pytest
+import torch
+from monarch import fetch_shard, no_mesh
+
+from monarch._testing import BackendType, TestingContext
+from monarch.builtins.random import (
+    get_rng_state_all_cuda_remote,
+    get_rng_state_remote,
+    initial_seed_remote,
+    manual_seed_all_cuda_remote,
+    manual_seed_cuda_remote,
+    random_seed_remote,
+    seed_remote,
+    set_manual_seed_remote,
+    set_rng_state_all_cuda_remote,
+    set_rng_state_remote,
+)
+
+
+@pytest.mark.timeout(120)
+@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+class TestRandomFunctions:
+    """Tests for the RNG/seed remote builtins in ``monarch.builtins.random``.
+
+    Every test is parametrized over ``BackendType.PY`` and ``BackendType.RS``.
+    Each test opens a 1-host, 1-GPU local device mesh, issues the calls under
+    test while the mesh is active, then retrieves the results with
+    ``fetch_shard(...).result()`` and asserts on them under
+    ``no_mesh.activate()``.
+    """
+
+    # TestingContext entered in setup_class and exited in teardown_class.
+    local = None
+
+    @classmethod
+    def setup_class(cls):
+        """Enter a ``TestingContext`` and keep it on the class for the tests."""
+        cls.local = TestingContext().__enter__()
+
+    @classmethod
+    def teardown_class(cls):
+        """Exit the ``TestingContext`` entered in ``setup_class``, if any."""
+        if cls.local is not None:
+            cls.local.__exit__(None, None, None)
+
+    @classmethod
+    def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True):
+        """Forward to the shared context's ``local_device_mesh``.
+
+        ``rust`` is set to True exactly when ``backend_type`` is
+        ``BackendType.RS``.
+
+        NOTE(review): callers below also wrap their bodies in
+        ``device_mesh.activate()`` although ``activate`` defaults to True
+        here -- confirm whether both are needed.
+        """
+        return cls.local.local_device_mesh(
+            num_hosts,
+            gpu_per_host,
+            activate,
+            rust=backend_type == BackendType.RS,
+        )
+
+    def test_set_manual_seed_remote(self, backend_type):
+        """Same seed reproduces the same tensor; a different seed does not."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                set_manual_seed_remote(12345)
+                t1 = torch.rand(5, 5)
+
+                set_manual_seed_remote(12345)
+                t2 = torch.rand(5, 5)
+
+                set_manual_seed_remote(12346)
+                t3 = torch.rand(5, 5)
+
+                # Expect t1 == t2 (same seed) and t1 != t3 (different seed).
+                result = fetch_shard((t1, t2, t3)).result()
+                with no_mesh.activate():
+                    assert torch.equal(result[0], result[1])
+                    assert not torch.equal(result[0], result[2])
+
+    def test_set_manual_seed_remote_with_process_idx(self, backend_type):
+        """Same base seed with different ``process_idx`` gives different tensors."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                set_manual_seed_remote(12345, process_idx=0)
+                t1 = torch.rand(5, 5)
+
+                set_manual_seed_remote(12345, process_idx=1)
+                t2 = torch.rand(5, 5)
+
+                result = fetch_shard((t1, t2)).result()
+                with no_mesh.activate():
+                    assert not torch.equal(result[0], result[1])
+
+    def test_initial_seed_remote(self, backend_type):
+        """The fetched initial seed is a Python ``int``."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                seed_value = initial_seed_remote()
+
+                result = fetch_shard(seed_value).result()
+                with no_mesh.activate():
+                    assert isinstance(result, int)
+
+    def test_get_rng_state(self, backend_type):
+        """RNG state is stable between reads and changes after sampling."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                state1 = get_rng_state_remote()
+                state2 = get_rng_state_remote()
+
+                # Generate a random tensor to advance the RNG state.
+                _ = torch.rand(5, 5)
+
+                state3 = get_rng_state_remote()
+
+                # Expect state1 == state2 (nothing sampled in between) and
+                # state1 != state3 (a tensor was sampled in between).
+                result = fetch_shard((state1, state2, state3)).result()
+                with no_mesh.activate():
+                    assert torch.equal(result[0], result[1])
+                    assert not torch.equal(result[0], result[2])
+
+    def test_set_rng_state(self, backend_type):
+        """Restoring a saved RNG state replays the same random tensor."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                # Save the initial RNG state.
+                state = get_rng_state_remote()
+
+                t1 = torch.rand(3, 3)
+                t2 = torch.rand(3, 3)
+
+                # Restore the saved RNG state.
+                set_rng_state_remote(state)
+                t3 = torch.rand(3, 3)
+
+                # Expect t1 == t3 (same state) and t1 != t2 (different state).
+                result = fetch_shard((t1, t2, t3)).result()
+                with no_mesh.activate():
+                    assert not torch.equal(result[0], result[1])
+                    assert torch.equal(result[0], result[2])
+
+    # torch.seed and torch.random.seed seem to be the same function, so both
+    # wrappers are exercised in this one test.
+    def test_random_seed(self, backend_type):
+        """Each reseed call is followed by a different random tensor."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                random_seed_remote()
+                t1 = torch.rand(5, 5)
+
+                random_seed_remote()
+                t2 = torch.rand(5, 5)
+
+                seed_remote()
+                t3 = torch.rand(5, 5)
+
+                # Only consecutive pairs are compared: t1 vs t2 and t2 vs t3.
+                result = fetch_shard((t1, t2, t3)).result()
+                with no_mesh.activate():
+                    assert not torch.equal(result[0], result[1])
+                    assert not torch.equal(result[1], result[2])
+
+    def test_get_rng_state_all_cuda(self, backend_type):
+        """The fetched CUDA RNG states are a list with one entry per GPU."""
+        NUM_GPUS = 1
+        with self.local_device_mesh(1, NUM_GPUS, backend_type) as device_mesh:
+            with device_mesh.activate():
+                states = get_rng_state_all_cuda_remote()
+
+                result = fetch_shard(states).result()
+                with no_mesh.activate():
+                    assert isinstance(result, list)
+                    assert len(result) == NUM_GPUS
+
+    def test_set_rng_state_all_cuda(self, backend_type):
+        """Restoring saved CUDA RNG states replays the same CUDA tensor."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                # Save the initial RNG states.
+                states = get_rng_state_all_cuda_remote()
+                t1 = torch.rand(3, 3, device="cuda")
+
+                # Restore the saved RNG states.
+                set_rng_state_all_cuda_remote(states)
+                t2 = torch.rand(3, 3, device="cuda")
+
+                # Expect t1 == t2 (same state).
+                result = fetch_shard((t1, t2)).result()
+                with no_mesh.activate():
+                    assert torch.equal(result[0], result[1])
+
+    def test_cuda_manual_seed(self, backend_type):
+        """Run the shared CUDA seed check against ``manual_seed_cuda_remote``."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                self._cuda_seed_test(manual_seed_cuda_remote)
+
+    def test_cuda_manual_seed_all(self, backend_type):
+        """Run the shared CUDA seed check against ``manual_seed_all_cuda_remote``."""
+        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
+            with device_mesh.activate():
+                self._cuda_seed_test(manual_seed_all_cuda_remote)
+
+    def _cuda_seed_test(self, seed_func):
+        """Check that ``seed_func`` makes CUDA sampling reproducible.
+
+        Both callers invoke this inside an active device mesh.
+
+        Args:
+            seed_func: Remote seeding function taking a single int seed.
+        """
+        seed_func(12345)
+        t1 = torch.rand(5, 5, device="cuda")
+
+        seed_func(12345)
+        t2 = torch.rand(5, 5, device="cuda")
+
+        seed_func(54321)
+        t3 = torch.rand(5, 5, device="cuda")
+
+        # Expect t1 == t2 (same seed) and t1 != t3 (different seed).
+        result = fetch_shard((t1, t2, t3)).result()
+        with no_mesh.activate():
+            assert torch.equal(result[0], result[1])
+            assert not torch.equal(result[0], result[2])