Commit caf6cd4

Sam Lurye authored and facebook-github-bot committed
Enable remaining tests for rust backend in test_remote_functions (#16)
Summary:
Pull Request resolved: #16

As titled. This involved fixing two issues with pipes (allow pipe processes to create tensors from wire values, and explicitly flush the pipe buffer during pipe send) and fixing an issue with tensor serialization.

Reviewed By: dulinriley

Differential Revision: D74854607

fbshipit-source-id: b457a51feb18741a0b4535c405e1ca9c46a8f24e
1 parent 695c89f · commit caf6cd4

5 files changed: +70 -31 lines changed


monarch_worker/src/bootstrap.rs

Lines changed: 5 additions & 1 deletion
@@ -106,8 +106,12 @@ pub fn bootstrap_pipe() -> Result<(), anyhow::Error> {
     // Value of 4 is arbitrary as our side does not need to do buffering.
     let mut pipe = StreamPipe::new(std::io::stdin(), std::io::stdout(), 4);
     let init: OutOfProcessSetupParams = pipe.recv()?;
+    // Create a PyPipe that allows unsafe object conversion. This allows the pipe to
+    // receive tensors, which we know is safe because StreamPipe receives the serialized
+    // tensors from out-of-process, and they therefore can't be owned by anything except
+    // the pipe's python code.
     run_py_pipe(
-        PyPipe::new(Box::new(pipe), init.ranks, init.sizes),
+        PyPipe::new(Box::new(pipe), init.ranks, init.sizes, true),
         init.function,
         init.args,
         init.kwargs,

monarch_worker/src/pipe.rs

Lines changed: 2 additions & 1 deletion
@@ -235,6 +235,7 @@ impl<T: Serialize + DeserializeOwned> Pipe<T> for StreamPipe {
         let len = bytes.len();
         self.writer.write_all(&len.to_be_bytes())?;
         self.writer.write_all(&bytes)?;
+        self.writer.flush()?;
         Ok(())
     }

@@ -374,7 +375,7 @@ impl PipeMessageHandler for PipeActor {
         // TODO(agallagher): Propagate failures and use a timeout?
         tokio::select! {
             res = self.handle.wait() => bail!("pipe server exited: {:?}", res),
-            res = self.pipe.as_mut().unwrap().recv() => res,
+            res = self.pipe.as_mut().unwrap().recv() => res
         }
     }
 }
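
For context, here is a minimal, self-contained sketch of the length-prefixed send path shown above, assuming the underlying writer is buffered (e.g. wrapped in a `std::io::BufWriter`); the function name is hypothetical. Without the final `flush()`, a short frame can sit in the write buffer where the peer's `recv()` never sees it, which would explain why an explicit flush is needed during pipe send.

```rust
use std::io::{self, Write};

// Hypothetical sketch of StreamPipe-style framing: a big-endian length
// prefix followed by the payload. The final flush() pushes the frame out
// of any intermediate buffering so the peer can read it immediately.
fn send_frame<W: Write>(writer: &mut W, bytes: &[u8]) -> io::Result<()> {
    let len: u64 = bytes.len() as u64;
    writer.write_all(&len.to_be_bytes())?; // 8-byte length prefix
    writer.write_all(bytes)?;              // payload
    writer.flush()?;                       // don't leave the frame buffered
    Ok(())
}
```

With a `BufWriter` over stdout, dropping the `flush()` would delay delivery until the buffer fills or the writer is dropped.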

monarch_worker/src/py_pipe.rs

Lines changed: 23 additions & 4 deletions
@@ -11,6 +11,7 @@ use std::collections::HashMap;
 use monarch_messages::worker::ResolvableFunction;
 use monarch_types::PyTree;
 use monarch_types::TryIntoPyObject;
+use monarch_types::TryIntoPyObjectUnsafe;
 use pyo3::prelude::*;
 use pyo3::types::PyTuple;
 use torch_sys::RValue;

@@ -25,15 +26,22 @@ pub struct PyPipe {
     ranks: HashMap<String, usize>,
     #[pyo3(get)]
     sizes: HashMap<String, usize>,
+    allow_unsafe_obj_conversion: bool,
 }

 impl PyPipe {
     pub fn new(
         pipe: Box<dyn Pipe<PyTree<RValue>> + Send>,
         ranks: HashMap<String, usize>,
         sizes: HashMap<String, usize>,
+        allow_unsafe_obj_conversion: bool,
     ) -> Self {
-        Self { pipe, ranks, sizes }
+        Self {
+            pipe,
+            ranks,
+            sizes,
+            allow_unsafe_obj_conversion,
+        }
     }
 }

@@ -46,8 +54,14 @@ impl PyPipe {
     }

     fn recv<'a>(&mut self, py: Python<'a>) -> PyResult<Bound<'a, PyAny>> {
-        py.allow_threads(move || self.pipe.recv())?
-            .try_to_object(py)
+        let val = py.allow_threads(|| self.pipe.recv())?;
+        if self.allow_unsafe_obj_conversion {
+            // SAFETY: A caller who initialized this PyPipe with allow_unsafe_obj_conversion=True
+            // asserts that it is safe to use this unsafe method.
+            unsafe { val.try_to_object_unsafe(py) }
+        } else {
+            val.try_to_object(py)
+        }
     }
 }

@@ -122,7 +136,12 @@ mod tests {
         async move {
             tokio::task::spawn_blocking(move || {
                 run_py_pipe(
-                    PyPipe::new(Box::new(server), HashMap::new(), HashMap::new()),
+                    PyPipe::new(
+                        Box::new(server),
+                        HashMap::new(),
+                        HashMap::new(),
+                        false, // allow_unsafe_obj_conversion
+                    ),
                     "test_helpers.func".into(),
                     vec![],
                     HashMap::new(),
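
The `allow_unsafe_obj_conversion` flag makes the unsafe conversion an explicit opt-in at construction time, so each call site carries its own safety argument. A simplified, self-contained sketch of the pattern (hypothetical types, not the monarch API):

```rust
struct Wire(Vec<u8>);

impl Wire {
    // Safe path: copy the bytes into a fresh owned object.
    fn to_object(&self) -> Vec<u8> {
        self.0.clone()
    }

    /// # Safety
    /// The caller must guarantee nothing else aliases the underlying buffer.
    unsafe fn to_object_unsafe(self) -> Vec<u8> {
        self.0 // zero-copy path: take ownership of the buffer directly
    }
}

struct Pipe {
    allow_unsafe_obj_conversion: bool,
}

impl Pipe {
    fn convert(&self, val: Wire) -> Vec<u8> {
        if self.allow_unsafe_obj_conversion {
            // SAFETY: whoever constructed this Pipe with the flag set asserts
            // that values received over the wire are exclusively owned here.
            unsafe { val.to_object_unsafe() }
        } else {
            val.to_object()
        }
    }
}
```

The out-of-process bootstrap path above passes `true` (wire-deserialized tensors can't be aliased), while the in-process test passes `false`.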

python/tests/test_remote_functions.py

Lines changed: 0 additions & 25 deletions
@@ -518,8 +518,6 @@ def test_remote_function_isend(self, backend_type):
         assert local_finished_1.item() == 1.0

     def test_distributed_error(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
         with self.local_device_mesh(2, 2, backend_type) as _:
             x = torch.rand(3, 4).cuda()
             y = torch.rand(3, 4).cuda()

@@ -545,8 +543,6 @@ def test_distributed_error(self, backend_type):
             fetch_shard(2 * x, gpu=1, host=0).result()

     def test_pipe(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
         with self.local_device_mesh(2, 2, backend_type):
             p = example_echo_add()
             for _i in range(10):

@@ -566,8 +562,6 @@ def test_loader(self, backend_type):
             assert x.item() == i

     def test_loader_blocks_with_small_pipe(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
         with self.local_device_mesh(2, 2, backend_type):
             iters = 10
             p = example_data_loader_small_pipe(iters, (1000, 1000))

@@ -581,8 +575,6 @@ def test_loader_blocks_with_small_pipe(self, backend_type):
             assert t[0][0].item() == -1.0

     def test_streams_run_parallel(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
         with self.local_device_mesh(2, 2, backend_type):
             # test that these two streams do in fact run in parallel
             # on the worker by having each stream wait on a barrier.

@@ -643,8 +635,6 @@ def test_fetch_preprocess(self, backend_type):
         )

     def test_cached_remote_function(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
         fn = remote("monarch.worker._testing_function.how_many_of_these_do_you_want")
         start_hits = remote_module._hit
         with self.local_device_mesh(2, 2, backend_type):

@@ -713,9 +703,6 @@ def test_cached_remote_aliases(self, backend_type):
         assert outs[2]._fake.storage_offset() == 40

     def test_live_function(self, backend_type):
-        if backend_type == BackendType.RS:
-            pytest.skip("FIXME: Rust support for this function")
-
         def bar(x, y):
             return (
                 a_function_called_by_a_live_function(x)

@@ -1094,18 +1081,6 @@ def test_nccl_barrier(self, backend_type: BackendType) -> None:
             inspect(t, {"host": host, "gpu": gpu}),
         )

-    def test_nccl_barrier_device_ids(self, backend_type: BackendType) -> None:
-        if backend_type == BackendType.PY:
-            # pyre-ignore[29]: pytest.skip is callable.
-            pytest.skip("FIXME: Python support for this function")
-        with self.local_device_mesh(
-            self.N_HOSTS, self.N_GPUS, backend_type
-        ) as device_mesh:
-            pg = device_mesh.process_group(("host", "gpu"))
-            rank = device_mesh.rank("host") * self.N_GPUS + device_mesh.rank("gpu")
-            with pytest.raises(monarch.common.invocation.RemoteException):
-                inspect(barrier(device_ids=[rank], group=pg))
-
     def test_tensor_dtype_complex(self, backend_type: BackendType) -> None:
         self._test_tensor_dtype_complex(backend_type)

torch-sys/src/bridge.cpp

Lines changed: 40 additions & 0 deletions
@@ -508,6 +508,7 @@ Tensor tensor_from_py_object(PyObject* unowned) {
 // TODO: We can do better for IValue serde as we dont need pickle compat here.
 const char kIValueStart = '\x01';
 const char kTensorsStart = '\x02';
+const char kWrappedNumberStart = '\x03';
 rust::Vec<uint8_t> serialize_ivalue(const IValue& iv) {
   if (iv.isTensor() && !iv.toTensor().defined()) {
     // Special case for undefined tensors as pickle doesnt

@@ -529,6 +530,18 @@ rust::Vec<uint8_t> serialize_ivalue(const IValue& iv) {
     }
     std::copy(
         tensors_data.begin(), tensors_data.end(), std::back_inserter(out));
+    // Tensor serialization doesn't maintain the wrapped number flag, so we
+    // need to manually serialize it. This is important to maintain because
+    // it has implications for the output type of torch ops.
+    out.push_back(kWrappedNumberStart);
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      uint8_t offset = i % sizeof(uint8_t);
+      if (offset == 0) {
+        out.push_back(0);
+      }
+      out.back() |= static_cast<uint8_t>(
+          tensors.at(i).unsafeGetTensorImpl()->is_wrapped_number() << offset);
+    }
   }
   out.push_back(kIValueStart);
   out.reserve(out.size() + pickle_data.size());

@@ -565,6 +578,33 @@ IValue deserialize_ivalue(rust::Slice<const uint8_t> buf) {
     rust::Slice<const uint8_t> tensor_data(buf.data() + i, tensors_size);
     tensors = load<std::vector<Tensor>>(tensor_data);
     i += tensors_size;
+    if (i >= buf.size() || buf.at(i) != kWrappedNumberStart) {
+      throw std::runtime_error(
+          "Invalid IValue serialization: missing wrapped number start byte");
+    }
+    for (size_t tensor_index = 0; tensor_index < tensors.size();
+         tensor_index++) {
+      uint8_t offset = tensor_index % sizeof(uint8_t);
+      if (offset == 0) {
+        i++;
+      }
+      if (i >= buf.size()) {
+        throw std::runtime_error(
+            "Invalid IValue serialization: wrapped number data truncated");
+      }
+      bool wrapped_number = (buf.at(i) >> offset) & 0x01;
+      if (wrapped_number) {
+        // You would think we could just call
+        // set_wrapped_number(wrapped_number), but you'd be wrong. Internally,
+        // set_wrapped_number asserts a 0-dim tensor regardless of whether its
+        // argument is true or false, so we can only call set_wrapped_number
+        // safely when wrapped_number == true.
+        tensors.at(tensor_index)
+            .unsafeGetTensorImpl()
+            ->set_wrapped_number(true);
+      }
+    }
+    i++;
   }
   if (i >= buf.size() || buf.at(i++) != kIValueStart) {
     throw std::runtime_error(
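
As a reading aid, here is a small Rust sketch (hypothetical, not part of the commit) of the wrapped-number flag framing used above. Note that the C++ strides by `sizeof(uint8_t)`, which is 1, so each flag effectively occupies its own byte; a stride of 8 would pack eight flags per byte with the same logic.

```rust
// Mirrors the C++ above: a start byte, then one buffer byte per
// FLAGS_PER_BYTE flags, each flag OR'd in at its bit offset.
const WRAPPED_NUMBER_START: u8 = 0x03;
const FLAGS_PER_BYTE: usize = 1; // mirrors `i % sizeof(uint8_t)` in the C++

fn serialize_flags(flags: &[bool], out: &mut Vec<u8>) {
    out.push(WRAPPED_NUMBER_START);
    for (i, &flag) in flags.iter().enumerate() {
        let offset = i % FLAGS_PER_BYTE;
        if offset == 0 {
            out.push(0); // start a new flag byte
        }
        *out.last_mut().unwrap() |= (flag as u8) << offset;
    }
}

fn deserialize_flags(buf: &[u8], n: usize) -> Result<Vec<bool>, String> {
    let mut i = 0;
    if buf.get(i) != Some(&WRAPPED_NUMBER_START) {
        return Err("missing wrapped number start byte".into());
    }
    let mut flags = Vec::with_capacity(n);
    for idx in 0..n {
        let offset = idx % FLAGS_PER_BYTE;
        if offset == 0 {
            i += 1; // advance to the next flag byte
        }
        let byte = *buf.get(i).ok_or("wrapped number data truncated")?;
        flags.push((byte >> offset) & 1 == 1);
    }
    Ok(flags)
}
```

Round-tripping `[true, false, true]` through `serialize_flags` and `deserialize_flags` returns the same flags, matching the serialize/deserialize pair in the diff.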
