This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Add in pre-commit config and some more CI/CD #232

Closed
wants to merge 8 commits
36 changes: 36 additions & 0 deletions .github/workflows/python-app.yml
@@ -0,0 +1,36 @@
# Basic ruff + pytest workflow for Python 3.10

name: Python Lint and Test

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e .
        pip install -e .'[dev]'
        pip install -e .'[test]'
    - name: Lint with ruff
      run: |
        ruff check .
    - name: Running Tests
      run: |
        ./test/test_everything.sh
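
For reference, the same checks can be run locally before pushing. Below is a minimal sketch, assuming the repository root as the working directory and `ruff` on PATH; the helper script itself is hypothetical and not part of this PR.

```python
# run_ci_locally.py (hypothetical): mirrors the workflow's install,
# lint, and test steps. Stops at the first failing command.
import subprocess
import sys


def run(*cmd: str) -> None:
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    run(sys.executable, "-m", "pip", "install", "--upgrade", "pip")
    run(sys.executable, "-m", "pip", "install", "-e", ".")
    run(sys.executable, "-m", "pip", "install", "-e", ".[dev]")
    run(sys.executable, "-m", "pip", "install", "-e", ".[test]")
    run("ruff", "check", ".")
    run("./test/test_everything.sh")
```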
33 changes: 33 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,33 @@
exclude: 'build'

default_language_version:
  python: python3

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
    hooks:
      - id: trailing-whitespace
      - id: check-ast
      - id: check-merge-conflict
      - id: no-commit-to-branch
        args: ['--branch=main']
      - id: check-added-large-files
        args: ['--maxkb=500']
      - id: end-of-file-fixer
        exclude: '^(.*\.svg)$'

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.3.0
    hooks:
      # Run the linter.
      - id: ruff

  - repo: https://github.com/omnilib/ufmt
    rev: v2.3.0
    hooks:
      - id: ufmt
        additional_dependencies:
          - black == 23.3.0
          - usort == 1.0.6
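
With this config in place, the hooks are enabled through the usual pre-commit workflow (`pre-commit install`, then `pre-commit run --all-files`). A minimal sketch as a Python wrapper follows, assuming pre-commit is already installed from PyPI; the script is hypothetical and not part of this PR.

```python
# enable_hooks.py (hypothetical): one-shot setup using the standard
# pre-commit CLI commands.
import subprocess

for cmd in (
    ["pre-commit", "install"],             # register the git pre-commit hook
    ["pre-commit", "run", "--all-files"],  # run every hook across the repo once
):
    subprocess.run(cmd, check=True)
```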
3 changes: 1 addition & 2 deletions benchmarks/bench_linear_float8.py
@@ -222,8 +222,7 @@ def wrapper(*args, **kwargs):
print(data_pd_simple)

sweep_path = sweep_path.with_suffix(".csv")
with open(sweep_path, mode="w") as file:
data_pd.to_csv(sweep_path)
data_pd.to_csv(sweep_path)


def invoke_main() -> None:
1 change: 0 additions & 1 deletion benchmarks/bench_matmul.py
@@ -66,7 +66,6 @@ def run(n_limit: Optional[int] = None):
results = []

name_to_shapes = name_to_shapes_70b
bsz_and_seq_len = ((4, 4096),)
dtypes = torch.bfloat16, torch.float16

for idx, (dtype, (name, (K, N))) in enumerate(
1 change: 0 additions & 1 deletion benchmarks/bench_multi_gpu.py
@@ -76,7 +76,6 @@ def fsdp_main(rank, world_size, args):
base_dtype, input_global, compile = args

# basic distributed data sampling
bsz_global = input_global.shape[0]
assert B % world_size == 0
bsz_local_start = int(rank / world_size * B)
bsz_local_end = int((rank + 1) / world_size * B)
1 change: 0 additions & 1 deletion float8_experimental/distributed_utils.py
@@ -58,7 +58,6 @@ def _transform(t):

def _reduce_scatter(ctx: Any, input_: torch.Tensor):
group = get_model_parallel_group()
rank = torch.distributed.get_rank(group)
world_size = torch.distributed.get_world_size(group)

assert input_.shape[0] % world_size == 0
2 changes: 0 additions & 2 deletions float8_experimental/float8_linear_utils.py
@@ -8,8 +8,6 @@
from enum import auto, Enum
from typing import List, Optional, Type

import float8_experimental.config as fp8_config

import torch
import torch.distributed as dist
import torch.nn as nn
10 changes: 8 additions & 2 deletions float8_experimental/float8_ops.py
@@ -14,6 +14,7 @@

aten = torch.ops.aten
c10d_functional = torch.ops.c10d_functional
_c10d_functional = torch.ops._c10d_functional
FLOAT8_OPS_TABLE: Dict[Any, Any] = {}


@@ -148,7 +149,12 @@ def autocast_to_copy(aten_op, args, kwargs=None):
)


@implements([c10d_functional.all_gather_into_tensor.default])
@implements(
[
c10d_functional.all_gather_into_tensor.default,
_c10d_functional.all_gather_into_tensor.default,
]
)
def allgather_fp8(aten_op, args, kwargs=None):
"""
override funcol with FP8 handling
@@ -166,7 +172,7 @@ def allgather_fp8(aten_op, args, kwargs=None):
return Float8Tensor(fp8_out, fp8_input._scale, fp8_input._orig_dtype)


@implements([c10d_functional.wait_tensor.default])
@implements([c10d_functional.wait_tensor.default, _c10d_functional.wait_tensor.default])
def wait_tensor_fp8(aten_op, args, kwargs=None):
fp8_input = args[0]
assert isinstance(fp8_input, Float8Tensor)
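
For context on the hunks above: the FP8 overrides are now registered for both the legacy `c10d_functional` ops and their newer `_c10d_functional` counterparts. The body of the `implements` decorator is not shown in this diff; the sketch below illustrates the dispatch-table registration pattern its usage implies, and is an assumption rather than the actual implementation.

```python
# Illustrative sketch of a dispatch-table decorator like `implements`;
# the real float8_experimental implementation may differ.
from typing import Any, Callable, Dict, List

FLOAT8_OPS_TABLE: Dict[Any, Callable[..., Any]] = {}


def implements(aten_ops: List[Any]):
    """Register the decorated function as the handler for every op in `aten_ops`."""

    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        for op in aten_ops:
            FLOAT8_OPS_TABLE[op] = fn
        return fn

    return decorator
```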
6 changes: 1 addition & 5 deletions float8_experimental/float8_tensor.py
@@ -7,11 +7,7 @@

import torch

from float8_experimental.float8_utils import (
tensor_to_amax,
tensor_to_scale,
to_fp8_saturated,
)
from float8_experimental.float8_utils import tensor_to_amax, to_fp8_saturated

from torch.distributed._tensor import DTensor

67 changes: 57 additions & 10 deletions pyproject.toml
@@ -22,8 +22,9 @@ dependencies = [
test = [
"transformers==4.32.0",
"pandas >= 2.0",
"tqdm==4.66.1",
"fire==0.5.0"
"tqdm==4.66.2",
"fire==0.5.0",
"expecttest",
]
dev = [
"black==23.3.0",
@@ -32,16 +33,62 @@ dev = [
"libcst==1.0.1",
"pytest==7.4.0",
"bumpver",
"pip-tools"
"pip-tools",
"ruff==0.3.0"
]

# Since we have multiple top level folders we specify what we want to be included
# in the package
[tool.setuptools]
packages = ["float8_experimental"]

# ---------- TOOL CONFIGURATIONS ------------
[tool.usort]
first_party_detection = false

[tool.black]
target-version = ["py38"]
target-version = ["py310"]

[tool.ruff]
# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".ipynb_checkpoints",
".mypy_cache",
".nox",
".pants.d",
".pyenv",
".pytest_cache",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
".vscode",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"site-packages",
"venv",
]

# Same as Black.
line-length = 88
indent-width = 4

# Assume Python 3.10
target-version = "py310"

[tool.ruff.lint]
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
select = ["E4", "E7", "E9", "F"]
ignore = ["E731"]

# Allow fix for all enabled rules (when `--fix` is provided).
fixable = ["ALL"]
unfixable = []

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
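
To make the selected rule set concrete, here is a small illustrative file (hypothetical, not part of the PR): under the configuration above, `ruff check` reports Pyflakes findings such as F401, while E731 is suppressed via `ignore`.

```python
# example.py (illustrative): with the config above, `ruff check example.py`
# flags the unused import but not the lambda assignment.
import os  # F401: `os` imported but unused

square = lambda x: x * x  # E731 is selected by "E7" but listed in `ignore`

print(square(4))
```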
55 changes: 33 additions & 22 deletions test/test_base.py
@@ -35,6 +35,8 @@
random.seed(0)
torch.manual_seed(0)

is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)


class TestFloat8Tensor(unittest.TestCase):
def test_preserves_dtype(self) -> None:
@@ -114,13 +116,14 @@ def _test_linear_impl(
), f"{buffer_name} not filled, current value {buffer_value}"

# verify initialization flags got updated
assert m_fp8.is_amax_initialized == True
assert m_fp8.is_amax_initialized, "Amax was not properly initialized"

@pytest.mark.parametrize("emulate", [True, False])
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize("use_activation_hooks", [True, False])
@pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_linear_nobias(
self,
x_shape,
@@ -142,14 +145,15 @@ def test_linear_nobias(
m_ref = nn.Linear(16, 32, bias=False, device="cuda")
self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)

@pytest.mark.parametrize("emulate", [True, False])
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
@pytest.mark.parametrize("use_activation_hooks", [True, False])
@pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_linear_bias(
self,
x_shape,
@@ -172,13 +176,14 @@ def test_linear_bias(
m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)

@pytest.mark.parametrize("emulate", [True, False])
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
@pytest.mark.parametrize("use_activation_hooks", [True, False])
@pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_autocast_outputs(
self,
linear_type: LinearType,
@@ -225,31 +230,36 @@ def test_autocast_outputs(
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_type_cast(
self, linear_type: LinearType, linear_dtype: torch.dtype, emulate: bool
):
emulate = (
not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0)
)

m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
m = Float8Linear.from_float(m, emulate)
m = get_float8_linear(linear_type, m, emulate, False)

# Cast the module to dtype
m = m.to(dtype=linear_dtype)
# Check amax buffer types
for key in [
"fp8_amax_x",
"fp8_amax_history_x",
"fp8_scale_x",
"fp8_amax_w",
"fp8_amax_history_w",
"fp8_scale_w",
"fp8_amax_dL_dY",
"fp8_amax_history_dL_dY",
"fp8_scale_dL_dY",
]:
assert (
m._buffers[key].dtype == torch.float32
), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"
if linear_requires_sync(linear_type):
# Check amax buffer types
for key in [
"fp8_amax_x",
"fp8_amax_history_x",
"fp8_scale_x",
"fp8_amax_w",
"fp8_amax_history_w",
"fp8_scale_w",
"fp8_amax_dL_dY",
"fp8_amax_history_dL_dY",
"fp8_scale_dL_dY",
]:
assert (
m._buffers[key].dtype == torch.float32
), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"

# autocast off
x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
@@ -273,7 +283,7 @@ def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):

class TestScaledMM:
@unittest.skipIf(
not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0),
not is_H100,
"CUDA not available",
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -321,6 +331,7 @@ def test_scaled_mm_vs_emulated(self, base_dtype):

class TestNumerics:
@pytest.mark.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_small_amax_float16(self, float8_dtype):
# If we calculate scale naively with FP8_MAX_POS / amax,
# the result may not be representable in fp16. Verify that
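
The recurring pattern in the test changes above is a capability gate: non-emulated cases run only when an SM 9.0 (H100-class) GPU is present, and GPU tests are skipped entirely without CUDA. The condensed sketch below restates that pattern with a placeholder test and uses `pytest.mark.skipif` in place of the `unittest.skipIf` used in the diff.

```python
# Condensed illustration of the gating used in test_base.py (placeholder test).
import pytest
import torch

is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)


@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_placeholder(emulate: bool):
    # On machines without an H100, only the emulated path is exercised.
    assert emulate or is_H100
```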