This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 3f85814

Author: Andrew Gu
Used functional all-reduce for amax reduction
ghstack-source-id: 256aebe
Pull Request resolved: #219
1 parent: 7032367

File tree

1 file changed: +7 -1 lines changed

float8_experimental/float8_utils.py

Lines changed: 7 additions & 1 deletion
@@ -75,7 +75,13 @@ def tensor_to_amax(x, distributed_reduction=False):
     # If the user did not ask for it, assume that it will
     # happen elsewhere.
     if distributed_reduction and dist.is_initialized():
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX)
+        # TODO: Dynamo rewriting synchronous in-place collectives fails:
+        # https://github.com/pytorch/pytorch/issues/120082
+        # Use functional all-reduce to avoid graph breaking.
+        amax = dist._functional_collectives.all_reduce(
+            amax, "MAX", list(range(dist.get_world_size()))
+        )
+        # dist.all_reduce(amax, op=dist.ReduceOp.MAX)

     return amax
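For context, a minimal sketch of the change in use. The single-process "gloo" setup, the x.abs().max() amax computation, and the explicit submodule import are assumptions for illustration, not part of this diff. The key point is that the functional collective returns a new tensor instead of mutating amax in place, which is what lets Dynamo trace it without a graph break:

# Minimal sketch, not from the diff: single-process setup so the
# functional all-reduce can run locally for illustration.
import os

import torch
import torch.distributed as dist
import torch.distributed._functional_collectives  # makes dist._functional_collectives resolvable


os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)


def tensor_to_amax(x, distributed_reduction=False):
    amax = x.abs().max()  # assumed amax computation; the hunk above starts below this line
    if distributed_reduction and dist.is_initialized():
        # Out-of-place collective: returns a new tensor rather than
        # mutating `amax`, so Dynamo can trace through it.
        amax = dist._functional_collectives.all_reduce(
            amax, "MAX", list(range(dist.get_world_size()))
        )
    return amax


x = torch.randn(16, 16)
print(tensor_to_amax(x, distributed_reduction=True))  # max |x| over all ranks
dist.destroy_process_group()

The function should then compile cleanly with torch.compile(tensor_to_amax), which is the point of the change; the in-place dist.all_reduce hits the graph break tracked in the linked issue.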
