This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 607ff7b

drisspg authored and facebook-github-bot committed
Add in pre-commit config and some more CI/CD (#232)
Summary: Some quality of life changes
Pull Request resolved: #232
Reviewed By: wanchaol
Differential Revision: D54437609
Pulled By: drisspg
fbshipit-source-id: 31c27a98695ee5c092b52d59c8520c844ad2a700
1 parent b9b37f8 commit 607ff7b

20 files changed: +198 -53 lines changed

.github/workflows/python-app.yml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# Basic flak8 + pytest workflow for Python 3.10
+
+name: Python Lint and Test
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .
+        pip install -e .'[dev]'
+        pip install -e .'[test]'
+    - name: Lint with ruff
+      run: |
+        ruff check .
+    - name: Running Tests
+      run: |
+        ./test/test_everything.sh

.pre-commit-config.yaml

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+exclude: 'build'
+
+default_language_version:
+  python: python3
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
+    hooks:
+      - id: trailing-whitespace
+      - id: check-ast
+      - id: check-merge-conflict
+      - id: no-commit-to-branch
+        args: ['--branch=main']
+      - id: check-added-large-files
+        args: ['--maxkb=500']
+      - id: end-of-file-fixer
+        exclude: '^(.*\.svg)$'
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.3.0
+    hooks:
+      # Run the linter.
+      - id: ruff
+
+  - repo: https://github.com/omnilib/ufmt
+    rev: v2.3.0
+    hooks:
+      - id: ufmt
+        additional_dependencies:
+          - black == 23.3.0
+          - usort == 1.0.6

benchmarks/bench_linear_float8.py

Lines changed: 1 addition & 2 deletions
@@ -222,8 +222,7 @@ def wrapper(*args, **kwargs):
     print(data_pd_simple)
 
     sweep_path = sweep_path.with_suffix(".csv")
-    with open(sweep_path, mode="w") as file:
-        data_pd.to_csv(sweep_path)
+    data_pd.to_csv(sweep_path)
 
 
 def invoke_main() -> None:
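The deleted `with open(...)` wrapper was redundant: `pandas.DataFrame.to_csv` accepts a path and manages the file handle itself, and the opened `file` object was never used. A minimal sketch of the resulting pattern, with a hypothetical frame and path standing in for the benchmark's `data_pd` and `sweep_path`:

    from pathlib import Path

    import pandas as pd

    # Hypothetical stand-in for the benchmark's results frame.
    data_pd = pd.DataFrame({"name": ["attn.wq", "ffn.w1"], "tops_sec": [412.3, 508.9]})

    sweep_path = Path("sweep_results").with_suffix(".csv")
    # to_csv opens and closes the target file on its own; no explicit open() needed.
    data_pd.to_csv(sweep_path)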

benchmarks/bench_matmul.py

Lines changed: 0 additions & 1 deletion
@@ -66,7 +66,6 @@ def run(n_limit: Optional[int] = None):
     results = []
 
     name_to_shapes = name_to_shapes_70b
-    bsz_and_seq_len = ((4, 4096),)
     dtypes = torch.bfloat16, torch.float16
 
     for idx, (dtype, (name, (K, N))) in enumerate(

benchmarks/bench_multi_gpu.py

Lines changed: 0 additions & 1 deletion
@@ -76,7 +76,6 @@ def fsdp_main(rank, world_size, args):
     base_dtype, input_global, compile = args
 
     # basic distributed data sampling
-    bsz_global = input_global.shape[0]
     assert B % world_size == 0
     bsz_local_start = int(rank / world_size * B)
     bsz_local_end = int((rank + 1) / world_size * B)
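The deleted `bsz_global` is an unused local, the kind of assignment the newly enabled ruff `F` rules flag; the surrounding lines already shard the global batch across ranks from the in-scope batch size `B`. A small self-contained sketch of that slicing arithmetic, plain Python with made-up sizes and no process group:

    def local_batch_slice(rank: int, world_size: int, B: int) -> tuple[int, int]:
        """Return the [start, end) rows of the global batch owned by this rank."""
        assert B % world_size == 0, "global batch must divide evenly across ranks"
        bsz_local_start = int(rank / world_size * B)
        bsz_local_end = int((rank + 1) / world_size * B)
        return bsz_local_start, bsz_local_end

    # With B=8 and world_size=4, rank 1 owns rows [2, 4).
    print(local_batch_slice(rank=1, world_size=4, B=8))  # (2, 4)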

float8_experimental/distributed_utils.py

Lines changed: 0 additions & 1 deletion
@@ -58,7 +58,6 @@ def _transform(t):
 
 def _reduce_scatter(ctx: Any, input_: torch.Tensor):
     group = get_model_parallel_group()
-    rank = torch.distributed.get_rank(group)
     world_size = torch.distributed.get_world_size(group)
 
     assert input_.shape[0] % world_size == 0

float8_experimental/float8_linear_utils.py

Lines changed: 0 additions & 2 deletions
@@ -8,8 +8,6 @@
 from enum import auto, Enum
 from typing import List, Optional, Type
 
-import float8_experimental.config as fp8_config
-
 import torch
 import torch.distributed as dist
 import torch.nn as nn

float8_experimental/float8_ops.py

Lines changed: 8 additions & 2 deletions
@@ -14,6 +14,7 @@
 
 aten = torch.ops.aten
 c10d_functional = torch.ops.c10d_functional
+_c10d_functional = torch.ops._c10d_functional
 FLOAT8_OPS_TABLE: Dict[Any, Any] = {}
 
 
@@ -148,7 +149,12 @@ def autocast_to_copy(aten_op, args, kwargs=None):
     )
 
 
-@implements([c10d_functional.all_gather_into_tensor.default])
+@implements(
+    [
+        c10d_functional.all_gather_into_tensor.default,
+        _c10d_functional.all_gather_into_tensor.default,
+    ]
+)
 def allgather_fp8(aten_op, args, kwargs=None):
     """
     override funcol with FP8 handling
@@ -166,7 +172,7 @@ def allgather_fp8(aten_op, args, kwargs=None):
     return Float8Tensor(fp8_out, fp8_input._scale, fp8_input._orig_dtype)
 
 
-@implements([c10d_functional.wait_tensor.default])
+@implements([c10d_functional.wait_tensor.default, _c10d_functional.wait_tensor.default])
 def wait_tensor_fp8(aten_op, args, kwargs=None):
     fp8_input = args[0]
     assert isinstance(fp8_input, Float8Tensor)
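This change registers the same FP8 handlers under both the legacy `c10d_functional` and the newer `_c10d_functional` op namespaces, so the collectives resolve to these overrides regardless of which funcol implementation PyTorch dispatches through. A stripped-down sketch of how an `implements`-style decorator can populate a dispatch table keyed by several op overloads; this is a generic illustration, not the library's actual code:

    from typing import Any, Callable, Dict

    OPS_TABLE: Dict[Any, Callable] = {}

    def implements(ops):
        """Register one handler under every op overload in `ops`."""
        def decorator(fn):
            for op in ops:
                OPS_TABLE[op] = fn
            return fn
        return decorator

    # Strings stand in for the real torch.ops overload objects.
    @implements(["c10d.all_gather_into_tensor", "_c10d.all_gather_into_tensor"])
    def allgather_fp8(aten_op, args, kwargs=None):
        ...

    # Lookup by op: either namespace resolves to the same handler.
    handler = OPS_TABLE["_c10d.all_gather_into_tensor"]
    assert handler is allgather_fp8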

float8_experimental/float8_tensor.py

Lines changed: 1 addition & 5 deletions
@@ -7,11 +7,7 @@
 
 import torch
 
-from float8_experimental.float8_utils import (
-    tensor_to_amax,
-    tensor_to_scale,
-    to_fp8_saturated,
-)
+from float8_experimental.float8_utils import tensor_to_amax, to_fp8_saturated
 
 from torch.distributed._tensor import DTensor

pyproject.toml

Lines changed: 57 additions & 10 deletions
@@ -22,8 +22,9 @@ dependencies = [
 test = [
     "transformers==4.32.0",
     "pandas >= 2.0",
-    "tqdm==4.66.1",
-    "fire==0.5.0"
+    "tqdm==4.66.2",
+    "fire==0.5.0",
+    "expecttest",
 ]
 dev = [
     "black==23.3.0",
@@ -32,16 +33,62 @@ dev = [
     "libcst==1.0.1",
     "pytest==7.4.0",
     "bumpver",
-    "pip-tools"
+    "pip-tools",
+    "ruff==0.3.0"
 ]
-
-# Since we have multiple top level folders we specify what we want to be included
-# in the package
-[tool.setuptools]
-packages = ["float8_experimental"]
-
+# ---------- TOOL CONFIGURATIONS ------------
 [tool.usort]
 first_party_detection = false
 
 [tool.black]
-target-version = ["py38"]
+target-version = ["py310"]
+
+[tool.ruff]
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+]
+
+# Same as Black.
+line-length = 88
+indent-width = 4
+
+# Assume Python 3.10
+target-version = "py310"
+
+[tool.ruff.lint]
+# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
+select = ["E4", "E7", "E9", "F"]
+ignore = ["E731"]
+
+# Allow fix for all enabled rules (when `--fix`) is provided.
+fixable = ["ALL"]
+unfixable = []
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
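The `[tool.ruff.lint]` block follows ruff's documented defaults, Pyflakes (`F`) plus the `E4`/`E7`/`E9` pycodestyle groups, and additionally ignores `E731`, the rule against binding a lambda to a name instead of writing a `def`. A tiny illustration of what that ignore permits, with hypothetical names rather than code from this repo:

    # Accepted under this config because E731 is ignored.
    scale_fn = lambda amax, fp8_max: fp8_max / amax

    # The equivalent form E731 would normally steer you toward.
    def scale_fn_def(amax, fp8_max):
        return fp8_max / amax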

test/test_base.py

Lines changed: 33 additions & 22 deletions
@@ -35,6 +35,8 @@
 random.seed(0)
 torch.manual_seed(0)
 
+is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)
+
 
 class TestFloat8Tensor(unittest.TestCase):
     def test_preserves_dtype(self) -> None:
@@ -114,13 +116,14 @@ def _test_linear_impl(
         ), f"{buffer_name} not filled, current value {buffer_value}"
 
         # verify initialization flags got updated
-        assert m_fp8.is_amax_initialized == True
+        assert m_fp8.is_amax_initialized, "Amax was not properly initialized"
 
-    @pytest.mark.parametrize("emulate", [True, False])
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
     @pytest.mark.parametrize("use_activation_hooks", [True, False])
     @pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_linear_nobias(
         self,
         x_shape,
@@ -142,14 +145,15 @@ def test_linear_nobias(
         m_ref = nn.Linear(16, 32, bias=False, device="cuda")
         self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)
 
-    @pytest.mark.parametrize("emulate", [True, False])
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
     @pytest.mark.parametrize("use_activation_hooks", [True, False])
     @pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_linear_bias(
         self,
         x_shape,
@@ -172,13 +176,14 @@ def test_linear_bias(
         m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
         self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)
 
-    @pytest.mark.parametrize("emulate", [True, False])
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
     @pytest.mark.parametrize("use_activation_hooks", [True, False])
     @pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_autocast_outputs(
         self,
         linear_type: LinearType,
@@ -225,31 +230,36 @@ def test_autocast_outputs(
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
-    def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_type_cast(
+        self, linear_type: LinearType, linear_dtype: torch.dtype, emulate: bool
+    ):
         emulate = (
             not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0)
         )
 
         m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
-        m = Float8Linear.from_float(m, emulate)
+        m = get_float8_linear(linear_type, m, emulate, False)
 
         # Cast the module to dtype
         m = m.to(dtype=linear_dtype)
-        # Check amax buffer types
-        for key in [
-            "fp8_amax_x",
-            "fp8_amax_history_x",
-            "fp8_scale_x",
-            "fp8_amax_w",
-            "fp8_amax_history_w",
-            "fp8_scale_w",
-            "fp8_amax_dL_dY",
-            "fp8_amax_history_dL_dY",
-            "fp8_scale_dL_dY",
-        ]:
-            assert (
-                m._buffers[key].dtype == torch.float32
-            ), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"
+        if linear_requires_sync(linear_type):
+            # Check amax buffer types
+            for key in [
+                "fp8_amax_x",
+                "fp8_amax_history_x",
+                "fp8_scale_x",
+                "fp8_amax_w",
+                "fp8_amax_history_w",
+                "fp8_scale_w",
+                "fp8_amax_dL_dY",
+                "fp8_amax_history_dL_dY",
+                "fp8_scale_dL_dY",
+            ]:
+                assert (
+                    m._buffers[key].dtype == torch.float32
+                ), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"
 
         # autocast off
         x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
@@ -273,7 +283,7 @@ def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
 
 class TestScaledMM:
     @unittest.skipIf(
-        not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0),
+        not is_H100,
         "CUDA not available",
     )
     @pytest.mark.parametrize(
@@ -321,6 +331,7 @@ def test_scaled_mm_vs_emulated(self, base_dtype):
 
 class TestNumerics:
     @pytest.mark.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_small_amax_float16(self, float8_dtype):
         # If we calculate scale naively with FP8_MAX_POS / amax,
         # the result may not be representable in fp16. Verify that
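The recurring pattern in this file is hardware gating: the module-level `is_H100` flag shrinks the `emulate` parametrization to `[True]` on machines without compute capability 9.0, and `unittest.skipIf(not torch.cuda.is_available(), ...)` skips CUDA-dependent tests outright, which lets the suite run (emulated, or not at all) on the GPU-less ubuntu-latest job added in this commit. A minimal sketch of the same gating in isolation, using a hypothetical plain pytest function and `pytest.mark.skipif` in place of the file's `unittest.skipIf`:

    import pytest
    import torch

    # Compute capability (9, 0) corresponds to H100-class GPUs.
    is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)

    # Only exercise the non-emulated path when FP8-capable hardware is present,
    # and skip entirely on CPU-only runners.
    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_hypothetical_fp8_op(emulate):
        # On a non-H100 machine only the emulated path is ever requested.
        assert emulate or is_H100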

0 commit comments