
Commit ae89bec

amirafzali authored and facebook-github-bot committed
create UDF builtins for rng/seed torch functions
Summary:

**Diff Purpose & Changes**

1. Creating the random/RNG remote functions for our new builtins library:
   torch.manual_seed(seed)
   torch.initial_seed()
   torch.get_rng_state()
   torch.set_rng_state(state)
   torch.cuda.get_rng_state_all()
   torch.cuda.set_rng_state_all(states)

   These two appear to be the same function; we can consider removing one or the other from the library:
   torch.seed()
   torch.random.seed()

Reviewed By: vidhyav, colin2328

Differential Revision: D72944566
1 parent a1acccd commit ae89bec

File tree

2 files changed: +238, −0 lines changed


python/monarch/builtins/random.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
import random

import torch
from monarch.common.remote import remote


# XOR the seed with the process index so each process seeds its
# generator differently.
@remote(propagate="inspect")
def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
    torch.manual_seed(seed ^ process_idx)


@remote(propagate=lambda: 0)
def initial_seed_remote() -> int:
    return torch.initial_seed()


@remote(propagate=lambda: torch.zeros(1))
def get_rng_state_remote() -> torch.Tensor:
    return torch.get_rng_state()


@remote(propagate="inspect")
def set_rng_state_remote(new_state: torch.Tensor) -> None:
    torch.set_rng_state(new_state)


# torch.seed() and torch.random.seed() appear to be the same function;
# both wrappers are kept for now (see the commit summary).
@remote(propagate=lambda: int(random.random()))
def seed_remote() -> int:
    return torch.seed()


@remote(propagate=lambda: int(random.random()))
def random_seed_remote() -> int:
    return torch.random.seed()


@remote(propagate="inspect")
def manual_seed_cuda_remote(seed: int) -> None:
    torch.cuda.manual_seed(seed)


@remote(propagate="inspect")
def manual_seed_all_cuda_remote(seed: int) -> None:
    torch.cuda.manual_seed_all(seed)


@remote(propagate=lambda: [torch.zeros(1)])
def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
    return torch.cuda.get_rng_state_all()


@remote(propagate="inspect")
def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
    torch.cuda.set_rng_state_all(states)
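
For illustration, here is a minimal sketch of how these builtins are meant to be called, mirroring the test idiom in the file below. The `device_mesh` here is assumed to come from a mesh helper (not shown; the tests use `local.local_device_mesh`), and the sketch only uses names introduced in this diff plus `fetch_shard`/`no_mesh` from monarch:

import torch
from monarch import fetch_shard, no_mesh
from monarch.builtins.random import get_rng_state_remote, set_rng_state_remote

# `device_mesh` is assumed to be obtained elsewhere, as in the tests below.
with device_mesh.activate():
    state = get_rng_state_remote()  # snapshot the worker's CPU RNG state
    t1 = torch.rand(3, 3)

    set_rng_state_remote(state)     # rewind the worker's RNG
    t2 = torch.rand(3, 3)           # replays the same stream as t1

    result = fetch_shard((t1, t2)).result()
with no_mesh.activate():
    assert torch.equal(result[0], result[1])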

python/tests/builtins/test_random.py

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
# pyre-unsafe
import pytest
import torch
from monarch import fetch_shard, no_mesh

from monarch._testing import BackendType, TestingContext
from monarch.builtins.random import (
    get_rng_state_all_cuda_remote,
    get_rng_state_remote,
    initial_seed_remote,
    manual_seed_all_cuda_remote,
    manual_seed_cuda_remote,
    random_seed_remote,
    seed_remote,
    set_manual_seed_remote,
    set_rng_state_all_cuda_remote,
    set_rng_state_remote,
)


@pytest.fixture(scope="module", autouse=True)
def testing_context():
    global local
    with TestingContext() as local:
        yield


@pytest.mark.timeout(120)
@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
class TestRandomFunctions:
    @classmethod
    def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True):
        return local.local_device_mesh(
            num_hosts,
            gpu_per_host,
            activate,
            rust=backend_type == BackendType.RS,
        )

    def test_set_manual_seed_remote(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                set_manual_seed_remote(12345)
                t1 = torch.rand(5, 5)

                set_manual_seed_remote(12345)
                t2 = torch.rand(5, 5)

                set_manual_seed_remote(12346)
                t3 = torch.rand(5, 5)

                # t1 == t2 (same seed), t1 != t3 (different seed)
                result = fetch_shard((t1, t2, t3)).result()
            with no_mesh.activate():
                assert torch.equal(result[0], result[1])
                assert not torch.equal(result[0], result[2])

    def test_set_manual_seed_remote_with_process_idx(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                set_manual_seed_remote(12345, process_idx=0)
                t1 = torch.rand(5, 5)

                set_manual_seed_remote(12345, process_idx=1)
                t2 = torch.rand(5, 5)

                result = fetch_shard((t1, t2)).result()
            with no_mesh.activate():
                assert not torch.equal(result[0], result[1])

    def test_initial_seed_remote(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                seed_value = initial_seed_remote()

                result = fetch_shard(seed_value).result()
            with no_mesh.activate():
                assert isinstance(result, int)

    def test_get_rng_state(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                state1 = get_rng_state_remote()
                state2 = get_rng_state_remote()

                # generate a random tensor to change the state
                _ = torch.rand(5, 5)

                state3 = get_rng_state_remote()

                result = fetch_shard((state1, state2, state3)).result()
            with no_mesh.activate():
                assert torch.equal(result[0], result[1])
                assert not torch.equal(result[0], result[2])

    def test_set_rng_state(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                # save the initial RNG state
                state = get_rng_state_remote()

                t1 = torch.rand(3, 3)
                t2 = torch.rand(3, 3)

                # restore the saved RNG state
                set_rng_state_remote(state)
                t3 = torch.rand(3, 3)

                # t1 == t3 (same state), t1 != t2 (different state)
                result = fetch_shard((t1, t2, t3)).result()
            with no_mesh.activate():
                assert not torch.equal(result[0], result[1])
                assert torch.equal(result[0], result[2])

    # seed and random.seed seem to be the same function.
    def test_random_seed(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                random_seed_remote()
                t1 = torch.rand(5, 5)

                random_seed_remote()
                t2 = torch.rand(5, 5)

                seed_remote()
                t3 = torch.rand(5, 5)

                result = fetch_shard((t1, t2, t3)).result()
            with no_mesh.activate():
                assert not torch.equal(result[0], result[1])
                assert not torch.equal(result[1], result[2])

    def test_get_rng_state_all_cuda(self, backend_type):
        NUM_GPUS = 1
        with self.local_device_mesh(1, NUM_GPUS, backend_type) as device_mesh:
            with device_mesh.activate():
                states = get_rng_state_all_cuda_remote()

                result = fetch_shard(states).result()
            with no_mesh.activate():
                assert isinstance(result, list)
                assert len(result) == NUM_GPUS

    def test_set_rng_state_all_cuda(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                # save the initial RNG states
                states = get_rng_state_all_cuda_remote()
                t1 = torch.rand(3, 3, device="cuda")

                # restore the saved RNG states
                set_rng_state_all_cuda_remote(states)
                t2 = torch.rand(3, 3, device="cuda")

                # t1 == t2 (same state)
                result = fetch_shard((t1, t2)).result()
            with no_mesh.activate():
                assert torch.equal(result[0], result[1])

    def test_cuda_manual_seed(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                self._cuda_seed_test(manual_seed_cuda_remote)

    def test_cuda_manual_seed_all(self, backend_type):
        with self.local_device_mesh(1, 1, backend_type) as device_mesh:
            with device_mesh.activate():
                self._cuda_seed_test(manual_seed_all_cuda_remote)

    def _cuda_seed_test(self, seed_func):
        seed_func(12345)
        t1 = torch.rand(5, 5, device="cuda")

        seed_func(12345)
        t2 = torch.rand(5, 5, device="cuda")

        seed_func(54321)
        t3 = torch.rand(5, 5, device="cuda")

        # t1 == t2 (same seed), t1 != t3 (different seed)
        result = fetch_shard((t1, t2, t3)).result()
        with no_mesh.activate():
            assert torch.equal(result[0], result[1])
            assert not torch.equal(result[0], result[2])
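
On the open question from the summary (echoed in the comment above test_random_seed): in current PyTorch, torch.seed is defined in torch/random.py and re-exported at the top level, so the two names should refer to the same function object. A quick local sanity check, assuming a recent PyTorch build (not part of this diff):

import torch

# If this holds, one of seed_remote/random_seed_remote could be dropped.
assert torch.seed is torch.random.seed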
