@@ -54,7 +54,7 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
             raise RuntimeError(f"unknown quantizer {quantizer} specified")
 
         model = quantizer_class_dict[quantizer](
-            model, device, tokenizer, **q_kwargs
+            model, device=device, tokenizer=tokenizer, **q_kwargs
         ).quantized_model()
 
 
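The switch to keyword arguments at this call site matters because the handler constructors in this file (see the new handler below) declare `tokenizer` and the remaining options as keyword-only. A minimal sketch of the pattern, with `ExampleHandler` as a hypothetical stand-in for a `QuantHandler` subclass:

class ExampleHandler:  # hypothetical stand-in, not a class from this file
    def __init__(self, model, device=None, *, tokenizer=None, groupsize=128):
        self.model_ = model
        self.device = device
        self.tokenizer = tokenizer
        self.groupsize = groupsize

# ExampleHandler(model, "cuda", tok)                   # TypeError: tokenizer is keyword-only
# ExampleHandler(model, device="cuda", tokenizer=tok)  # matches the new call site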
@@ -450,7 +450,7 @@ def quantized_model(self) -> nn.Module:
 ##### embedding table quantization ######
 
 
-class EmbeddingOnlyInt8QuantHandler(QuantHandler):
+class EmbeddingOnlyQuantHandler(QuantHandler):
     def __init__(
         self,
         model: nn.Module,
@@ -545,6 +545,94 @@ def quantized_model(self) -> nn.Module:
 ##### weight only int4 per channel groupwise quantized code ######
 
 
+class NewWeightOnlyInt4QuantHandler(QuantHandler):
+    def __init__(
+        self,
+        model: nn.Module,
+        device=None,
+        *,
+        tokenizer=None,
+        groupsize=128,
+        inner_k_tiles=8,
+        padding_allowed=True,
+        weight: Optional[torch.Tensor] = None,
+        scales_and_zeros: Optional[torch.Tensor] = None,
+    ):
+        self.model_ = model
+        self.device = device
+        self.groupsize = groupsize
+        self.inner_k_tiles = inner_k_tiles
+        self.padding_allowed = padding_allowed
+        assert groupsize in [32, 64, 128, 256]
+        assert inner_k_tiles in [2, 4, 8]
+
+    @torch.no_grad()
+    def quantize(self, module):
+        # cur_state_dict = state_dict_device(self.model_.state_dict())
+        # dict_device = "cpu"  # self.device
+
+        device = self.device
+
+        for name, child in module.named_children():
+            # print(f"name: {name}")
+            if isinstance(child, torch.nn.Linear):
+                assert not child.bias
+                out_features = child.out_features
+                in_features = child.in_features
+                assert out_features % 8 == 0, "require out_features % 8 == 0"
+                # print(f"linear: {fqn}, in={in_features}, out={out_features}")
+
+                weight = child.weight.data
+                if not WeightOnlyInt4Linear._check_k(
+                    k=in_features,
+                    groupsize=self.groupsize,
+                    inner_k_tiles=self.inner_k_tiles,
+                ):
+                    if self.padding_allowed:
+                        print(
+                            f"warning: {name} is padded to satisfy in_features % 1024 == 0"
+                        )
+                        padded_in_features = find_multiple(in_features, 1024)
+                        weight = F.pad(
+                            weight, pad=(0, padded_in_features - in_features)
+                        )
+                    else:
+                        print(
+                            f"warning: {name} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, "
+                            + "and that groupsize and inner_k_tiles*16 evenly divide into it"
+                        )
+                        continue
+                weight_int4pack, scales_and_zeros = (
+                    WeightOnlyInt4Linear._prepare_weight_and_scales_and_zeros(
+                        weight.to(torch.float), self.groupsize, self.inner_k_tiles
+                    )
+                )
+                weight_int4pack = weight_int4pack.to(device=self.device)
+                scales_and_zeros = scales_and_zeros.to(device=self.device)
+
+                setattr(
+                    module,
+                    name,
+                    WeightOnlyInt4Linear(
+                        child.in_features,
+                        child.out_features,
+                        bias=False,
+                        device=self.device,
+                        groupsize=self.groupsize,
+                        inner_k_tiles=self.inner_k_tiles,
+                        weight=weight_int4pack,
+                        scales_and_zeros=scales_and_zeros,
+                    ),
+                )
+            else:
+                self.quantize(child)
+
+        return module
+
+    def quantized_model(self) -> nn.Module:
+        return self.quantize(self.model_)
+
+
 def replace_linear_int4(
     module,
     device,
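The padding branch in the new handler calls find_multiple(in_features, 1024), a helper presumably imported from a shared utility module in this repo. A minimal sketch of its assumed behavior (round n up to the next multiple of k):

def find_multiple(n: int, k: int) -> int:
    # Assumed behavior: smallest multiple of k that is >= n.
    if n % k == 0:
        return n
    return n + k - (n % k)

# find_multiple(4096, 1024) == 4096; find_multiple(5000, 1024) == 5120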
@@ -563,10 +651,10 @@ def replace_linear_int4(
             module,
             name,
             WeightOnlyInt4Linear(
-                device,
                 child.in_features,
                 child.out_features,
                 bias=False,
+                device=device,
                 groupsize=groupsize,
                 inner_k_tiles=inner_k_tiles,
             ),
@@ -1001,9 +1089,9 @@ def quantized_model(self) -> nn.Module:
1001
1089
# Must come last because __future__ annotations don't work for naked
1002
1090
# class references
1003
1091
quantizer_class_dict = {
1004
- "embedding" : EmbeddingOnlyInt8QuantHandler ,
1092
+ "embedding" : EmbeddingOnlyQuantHandler ,
1005
1093
"linear:int8" : WeightOnlyInt8QuantHandler ,
1006
- "linear:int4" : WeightOnlyInt4QuantHandler ,
1094
+ "linear:int4" : NewWeightOnlyInt4QuantHandler ,
1007
1095
"linear:a8w4dq" : Int8DynActInt4WeightQuantizer ,
1008
1096
"linear:int4-gptq" : WeightOnlyInt4GPTQQuantHandler ,
1009
1097
"linear:hqq" : WeightOnlyInt4HqqQuantHandler ,