This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 746519f

Author: Andrew Gu

Used functional all-reduce for amax reduction

ghstack-source-id: 8bbd5fa
Pull Request resolved: #219

1 parent 956195b

File tree

1 file changed: +6 −1 lines changed

float8_experimental/float8_utils.py

Lines changed: 6 additions & 1 deletion
@@ -75,7 +75,12 @@ def tensor_to_amax(x, distributed_reduction=False):
     # If the user did not ask for it, assume that it will
     # happen elsewhere.
     if distributed_reduction and dist.is_initialized():
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX)
+        # TODO: Dynamo rewriting synchronous in-place collectives does not work
+        # at the moment. Use functional all-reduce to avoid graph break.
+        amax = dist._functional_collectives.all_reduce(
+            amax, "MAX", list(range(dist.get_world_size()))
+        )
+        # dist.all_reduce(amax, op=dist.ReduceOp.MAX)
 
     return amax
