This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 6891cbe

bdhirsh authored and facebook-github-bot committed
add wait_tensor() after all_gather in float8 to fix mem leak (#262)
Summary: I'm going to write a more detailed post internally to explain this memory leak.

Tracking issue for a better fix in inductor: pytorch/pytorch#126338

Pull Request resolved: #262

Reviewed By: drisspg

Differential Revision: D57464230

Pulled By: bdhirsh

fbshipit-source-id: 134c50e95045c43f95b5aec4dd3df496ff3fb9a3
1 parent cb55df2 commit 6891cbe

File tree

1 file changed: 1 addition, 0 deletions


float8_experimental/float8_ops.py

Lines changed: 1 addition & 0 deletions
@@ -238,6 +238,7 @@ def allgather_fp8(aten_op, args, kwargs=None):
     fp8_data = fp8_data.view(torch.uint8)
     fp8_data = fp8_data.contiguous()
     fp8_out = aten_op(fp8_data, *args[1:], **kwargs)
+    fp8_out = torch.ops._c10d_functional.wait_tensor(fp8_out)
     fp8_out = fp8_out.view(fp8_input._data.dtype)
     return Float8Tensor(
         fp8_out, fp8_input._scale, fp8_input._orig_dtype, fp8_input._mm_config
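For context, a minimal sketch of the pattern this one-line change applies, not the repository's own code: it assumes PyTorch's _c10d_functional ops, including an all_gather_into_tensor(input, group_size, group_name) signature that does not appear in the commit itself; only wait_tensor is confirmed by the diff above.

# Illustrative sketch: a functional collective returns a tensor whose data is
# only valid once the pending async work completes. Explicitly calling
# wait_tensor() resolves that work; if the result is never waited on, the
# unfinished collective and its buffers can be kept alive, the kind of leak
# tracked in pytorch/pytorch#126338.
import torch

def all_gather_and_wait(shard, group_size, group_name):
    # Assumed op/signature: _c10d_functional.all_gather_into_tensor(input, group_size, group_name)
    out = torch.ops._c10d_functional.all_gather_into_tensor(shard, group_size, group_name)
    # Same call the commit adds after the fp8 all_gather output.
    out = torch.ops._c10d_functional.wait_tensor(out)
    return out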

0 commit comments
