This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit f32a4a4

drisspg, alugorey (authored and committed)
format
1 parent d5dc16e commit f32a4a4

File tree

5 files changed: +29, -13 lines

float8_experimental/float8_dynamic_linear.py

Lines changed: 2 additions & 4 deletions

@@ -22,7 +22,7 @@
     tensor_already_casted_to_fp8,
     to_fp8_no_autograd,
 )
-from float8_experimental.float8_utils import tensor_to_scale, e4m3_dtype, e5m2_dtype
+from float8_experimental.float8_utils import e4m3_dtype, e5m2_dtype, tensor_to_scale
 from torch._prims_common import suggest_memory_format


@@ -106,9 +106,7 @@ def cast_to_float8_e4m3fn(
     if tensor_already_casted_to_fp8(inpt_tensor):
         return inpt_tensor
     scale = tensor_to_scale(inpt_tensor, e4m3_dtype, reduce_amax)
-    return Float8Tensor.to_float8(
-        inpt_tensor, scale, e4m3_dtype, mm_config=mm_config
-    )
+    return Float8Tensor.to_float8(inpt_tensor, scale, e4m3_dtype, mm_config=mm_config)


 def cast_to_float8_e5m2_bw(
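
For readers unfamiliar with the call that was collapsed onto one line above, the rough idea of a dynamic e4m3 cast is: derive a per-tensor scale from the tensor's absolute maximum, then saturate-cast the scaled values into float8. The sketch below is a hypothetical, self-contained illustration, not the library's tensor_to_scale / Float8Tensor.to_float8 implementation; the helper name and the 1e-12 clamp are invented for the example.

```python
import torch

E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def dynamic_cast_to_e4m3_sketch(x: torch.Tensor):
    """Return (fp8_tensor, scale); a conceptual stand-in, not the library code."""
    # The per-tensor absolute maximum drives the dynamic scale.
    amax = x.abs().max().to(torch.float32)
    # The scale maps the observed range onto the e4m3 representable range;
    # clamp amax away from zero so the scale stays finite.
    scale = E4M3_MAX / torch.clamp(amax, min=1e-12)
    # Saturate before converting so out-of-range values clip to +/-448
    # instead of overflowing in float8.
    x_scaled = (x.to(torch.float32) * scale).clamp(-E4M3_MAX, E4M3_MAX)
    return x_scaled.to(torch.float8_e4m3fn), scale

x = torch.randn(16, 32)
x_fp8, scale = dynamic_cast_to_e4m3_sketch(x)
x_approx = x_fp8.to(torch.float32) / scale  # dequantized approximation of x
```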

float8_experimental/float8_linear.py

Lines changed: 6 additions & 1 deletion

@@ -21,7 +21,12 @@
     to_fp8_no_autograd,
 )

-from float8_experimental.float8_utils import amax_history_to_scale, tensor_to_amax, e4m3_dtype, e5m2_dtype
+from float8_experimental.float8_utils import (
+    amax_history_to_scale,
+    e4m3_dtype,
+    e5m2_dtype,
+    tensor_to_amax,
+)


 def _maybe_initialize_amaxes_scales_for_float8_cast(
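
The reordered import brings in amax_history_to_scale, which belongs to the delayed-scaling path: instead of deriving the scale from the current tensor, a rolling history of observed amaxes is reduced and converted into a scale. The snippet below is only a conceptual sketch under that assumption; it does not reproduce the signature or behavior of the real amax_history_to_scale.

```python
import torch

E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def amax_history_to_scale_sketch(amax_history: torch.Tensor) -> torch.Tensor:
    # Reduce the rolling history of observed amaxes (max here) and turn
    # the result into a scale; clamp so an all-zero history stays finite.
    amax = torch.clamp(amax_history.max(), min=1e-12)
    return E4M3_MAX / amax

history = torch.tensor([0.5, 2.0, 1.25])       # amaxes from recent iterations
scale = amax_history_to_scale_sketch(history)  # 448.0 / 2.0 = 224.0
```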

float8_experimental/float8_linear_utils.py

Lines changed: 5 additions & 1 deletion

@@ -14,7 +14,11 @@
 from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear import Float8Linear

-from float8_experimental.float8_utils import amax_history_to_scale_stack, e4m3_dtype, e5m2_dtype
+from float8_experimental.float8_utils import (
+    amax_history_to_scale_stack,
+    e4m3_dtype,
+    e5m2_dtype,
+)
 from torch.distributed._functional_collectives import all_reduce, AsyncCollectiveTensor

 log = logging.getLogger(__name__)

float8_experimental/float8_tensor.py

Lines changed: 5 additions & 1 deletion

@@ -9,7 +9,11 @@
 import torch

 import torch.distributed._functional_collectives as funcol
-from float8_experimental.float8_utils import tensor_to_amax, to_fp8_saturated, e4m3_dtype
+from float8_experimental.float8_utils import (
+    e4m3_dtype,
+    tensor_to_amax,
+    to_fp8_saturated,
+)
 from torch.distributed._tensor import DTensor

 aten = torch.ops.aten

test/test_base.py

Lines changed: 11 additions & 6 deletions

@@ -30,11 +30,11 @@
 )
 from float8_experimental.float8_utils import (
     compute_error,
+    e4m3_dtype,
+    e5m2_dtype,
     fp8_tensor_statistics,
     FP8_TYPES,
     tensor_to_scale,
-    e4m3_dtype,
-    e5m2_dtype,
 )

 random.seed(0)

@@ -389,10 +389,15 @@ def test_merge_configs(self):


 class TestNumerics:
-    @pytest.mark.parametrize("float8_dtype", [torch.float8_e4m3fn,
-                                              torch.float8_e5m2,
-                                              torch.float8_e4m3fnuz,
-                                              torch.float8_e5m2fnuz])
+    @pytest.mark.parametrize(
+        "float8_dtype",
+        [
+            torch.float8_e4m3fn,
+            torch.float8_e5m2,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        ],
+    )
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_small_amax_float16(self, float8_dtype):
         # If we calculate scale naively with FP8_MAX_POS / amax,
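
The reformatted parametrize decorator covers the corner case the inline comment points at: a naive scale of FP8_MAX_POS / amax overflows float16 when amax is very small. A short self-contained illustration follows; the clamp-to-FP16_MAX guard is one possible mitigation, not necessarily what the library does.

```python
import torch

FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0
FP16_MAX = torch.finfo(torch.float16).max            # 65504.0

amax = torch.tensor(1e-6, dtype=torch.float16)

# Naive scale: 448 / 1e-6 ~ 4.5e8 exceeds the float16 range and becomes inf.
naive_scale = FP8_E4M3_MAX / amax
# One possible guard: compute in float32 and clamp to the float16 maximum.
safe_scale = torch.clamp(FP8_E4M3_MAX / amax.float(), max=FP16_MAX).half()

print(naive_scale)  # tensor(inf, dtype=torch.float16)
print(safe_scale)   # tensor(65504., dtype=torch.float16)
```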
