Enable weights-only loading (torch.load with weights_only=True) by registering ScaledMMConfig as a safe global

drisspg · drisspg · commit 2ea0ab51975f · 2024-06-25T13:38:24.000-07:00
diff --git a/float8_experimental/__init__.py b/float8_experimental/__init__.py
@@ -5,11 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 # Lets define a few top level things here
 from float8_experimental.float8_linear import Float8Linear
-from float8_experimental.float8_tensor import Float8Tensor
+from float8_experimental.float8_tensor import Float8Tensor, ScaledMMConfig
 
 # Needed to load Float8Tensor with weights_only = True
 from torch.serialization import add_safe_globals
 
-add_safe_globals([Float8Tensor])
+add_safe_globals([Float8Tensor, ScaledMMConfig])
 
 __all__ = ["Float8Tensor", "Float8Linear"]
diff --git a/test/test_base.py b/test/test_base.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
+import io
 import itertools
 import random
 import unittest
@@ -12,6 +13,7 @@
 
 import torch
 import torch.nn as nn
+
 from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear import Float8Linear
 from float8_experimental.float8_linear_utils import (
@@ -82,6 +84,25 @@ def test_split_cat(self):
         catted = torch.cat(splits, dim=0)
         assert bitwise_identical(fp8_a, catted)
 
+    def test_weights_only_load(self):
+        module = nn.Linear(16, 16)
+        # Round-trip a pre-quantized fp8 state dict through an in-memory buffer
+        buffer = io.BytesIO()
+        fp8_module = swap_linear_with_float8_linear(
+            module,
+            Float8DynamicLinear,
+            from_float_kwargs={
+                "pre_quantize_weight": True,
+                "activation_scale": torch.tensor(
+                    [1.0], device="cuda", dtype=torch.float32
+                ),
+            },
+        )
+
+        torch.save(fp8_module.state_dict(), buffer)
+        buffer.seek(0)
+        _ = torch.load(buffer, weights_only=True)
+
 
 class TestFloat8Linear:
     def _test_linear_impl(
diff --git a/test/test_inference_flows.py b/test/test_inference_flows.py
@@ -179,12 +179,14 @@ def test_fp8_save_and_load(self, compile_backend: str, dtype: torch.dtype):
             )
 
         # Load the actual data
-        new_fp8_mlp.load_state_dict(torch.load(buffer), strict=True, assign=True)
+        new_fp8_mlp.load_state_dict(
+            torch.load(buffer, weights_only=True), strict=True, assign=True
+        )
 
         # Dynamic Activations + Quantized Weights
         def quantize_dynamic_linear(x: nn.Module):
             if isinstance(x, Float8DynamicLinear):
-                x.set_quantization_scales(True)
+                x.set_quantization_scales(pre_quantize_weight=True)
             return x
 
         new_fp8_mlp.apply(quantize_dynamic_linear)