
Commit b04747b

mikekgfb authored and malfet committed
Update state dict and model together (#573)
* code beautification
* code beautification, move functions together
* rewrite model rewriter
* rewrite quantizers
* weights is none check
* typo
* not weight -> weight is not None
* fix dimensions for parallel prefill
* test
* typo
* bfloat16 on ARM with MacOS 14
* precision for a8w4
* sdpa_kv
* fixes
* inline qlq definition
* trial and error
* qdq not working
* ci
* not so fast with bf16=fast
* typo, and handle fast across macOS versions...
* typo
* type cast
1 parent a5d83fc · commit b04747b

9 files changed: +131 additions, -95 deletions


.github/workflows/pull.yml

Lines changed: 9 additions & 3 deletions
@@ -631,7 +631,7 @@ jobs:
   test-mps:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
-      runner: macos-m1-stable
+      runner: macos-m1-stable # needs MPS, was macos-m1-stable
       script: |
         set -x
         # NS: Remove previous installation of torch first
@@ -740,7 +740,7 @@ jobs:
   test-mps-dtype:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
-      runner: macos-m1-stable
+      runner: macos-m1-stable # needs MPS, was macos-m1-stable
       script: |
         set -x
         # NS: Remove previous installation of torch first
@@ -918,7 +918,13 @@ jobs:
 
         python torchchat.py export stories15M --output-pte-path ./model.pte
         ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
-
+
+        for dtype in fp32 fp16; do # bf16 needs to be supported
+          echo "Testing export + runner with dtype=$dtype"
+          python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
+          ./cmake-out/et_run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}"
+        done
+
         echo "Tests complete."
   runner-aoti:
     name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})

build/builder.py

Lines changed: 4 additions & 1 deletion
@@ -194,13 +194,16 @@ def validate_model(
         if model is None:
             return
 
+        if self.is_tiktoken == self.is_sentencepiece:
+            raise RuntimeError("no tokenizer was found")
+
         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
         use_tiktoken = model.config.use_tiktoken
 
         if not (is_tiktoken == use_tiktoken) or not (is_sentencepiece != use_tiktoken):
             raise RuntimeError(
-                f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)} for {model_description}"
+                f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) for {model_description}"
             )
 
         return
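For context, a minimal sketch of the logic the two checks above implement. The helper name and its standalone boolean arguments are illustrative, not part of torchchat; in the real method the flags come from the tokenizer files and the model config. Once the first guard guarantees that exactly one tokenizer type was detected, is_sentencepiece equals not is_tiktoken, so the compound condition reduces to a single mismatch test.

# Hypothetical standalone version of the validate_model checks above.
def check_tokenizer(is_tiktoken: bool, is_sentencepiece: bool, use_tiktoken: bool) -> None:
    # Exactly one tokenizer type should have been detected; if both flags are
    # equal (both True or both False), no usable tokenizer was found.
    if is_tiktoken == is_sentencepiece:
        raise RuntimeError("no tokenizer was found")
    # With the guard above, is_sentencepiece == (not is_tiktoken), so the
    # model/tokenizer consistency check reduces to a single comparison.
    if is_tiktoken != use_tiktoken:
        raise RuntimeError(
            "model-specified tokenizer does not match provided tokenizer"
        )

check_tokenizer(is_tiktoken=True, is_sentencepiece=False, use_tiktoken=True)  # passes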

build/model.py

Lines changed: 2 additions & 0 deletions
@@ -124,8 +124,10 @@ def __init__(
         dtype=None,
     ):
         super().__init__()
+        print(f"dtype on entry {dtype}")
         if not dtype:
             dtype = get_precision()
+        print(f"dtype on get_prec {dtype}")
         cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
         self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
         self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))

build/utils.py

Lines changed: 4 additions & 4 deletions
@@ -134,12 +134,12 @@ def get_precision():
 
 def name_to_dtype(name):
     if (name == "fast") or (name == "fast16"):
+        # MacOS now supports bfloat16
         import platform
-
         if platform.processor() == "arm":
-            return torch.float16
-        else:
-            return torch.bfloat16
+            if int(platform.mac_ver()[0].split('.')[0]) < 14:
+                return torch.float16
+        return torch.bfloat16
 
     if name in name_to_dtype_dict:
         return name_to_dtype_dict[name]
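In effect, the "fast"/"fast16" aliases now resolve to bfloat16 everywhere except Apple Silicon machines running a macOS release older than 14, which still get float16. A standalone sketch of that resolution; the fast_dtype name is illustrative, and the guard on mac_ver() is a defensive tweak for non-macOS ARM machines:

import platform
import torch

def fast_dtype() -> torch.dtype:
    # Mirrors the "fast"/"fast16" branch of name_to_dtype above.
    if platform.processor() == "arm":
        mac_major = platform.mac_ver()[0].split(".")[0] or "0"  # "" off macOS
        if int(mac_major) < 14:
            return torch.float16
    return torch.bfloat16

print(fast_dtype())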

cli.py

Lines changed: 1 addition & 1 deletion
@@ -296,7 +296,7 @@ def _add_arguments_common(parser):
 def arg_init(args):
     if not (torch.__version__ > "2.3"):
         raise RuntimeError(
-            "You are using PyTorch {torch.__version__}. At this time, torchchat uses the latest PyTorch technology with high-performance kernels only available in PyTorch nightly until the PyTorch 2.4 release"
+            f"You are using PyTorch {torch.__version__}. At this time, torchchat uses the latest PyTorch technology with high-performance kernels only available in PyTorch nightly until the PyTorch 2.4 release"
         )
 
     if hasattr(args, "quantize") and Path(args.quantize).is_file():
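The only change here is the missing f prefix: without it, Python keeps the braces as literal text instead of interpolating the installed version. A two-line illustration with a made-up version string:

version = "2.3.0"  # hypothetical value
print("You are using PyTorch {version}")   # -> You are using PyTorch {version}
print(f"You are using PyTorch {version}")  # -> You are using PyTorch 2.3.0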

export_et.py

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def export_model(model, device, output_path, args=None) -> str:  # noqa: C901
         _skip_type_promotion=bool(target_precision == torch.float16),
     )
 
-    if target_precision == torch.float16:
+    if target_precision == torch.float16 or target_precision == torch.bfloat16:
         if state_dict_dtype != torch.float16:
             print("model.to torch.float16")
             model = model.to(dtype=torch.float16)
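The branch now treats a bfloat16 target the same as float16: in either case the model is cast to float16 before the ExecuTorch export. A hedged sketch of that normalization step; the function name and arguments are illustrative, not the export_model signature:

import torch

def normalize_export_dtype(model: torch.nn.Module,
                           target_precision: torch.dtype,
                           state_dict_dtype: torch.dtype) -> torch.nn.Module:
    # Both fp16 and bf16 targets end up as a float16 model for the .pte export.
    if target_precision in (torch.float16, torch.bfloat16):
        if state_dict_dtype != torch.float16:
            model = model.to(dtype=torch.float16)
    return model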

export_et_util.py

Lines changed: 7 additions & 6 deletions
@@ -63,15 +63,16 @@ def forward(self, x, freqs_cis, mask, input_pos=None):
         k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
         v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
 
-        q = apply_rotary_emb(q, freqs_cis)
-        k = apply_rotary_emb(k, freqs_cis)
-
+        q = apply_rotary_emb(q, freqs_cis).to(dtype=torch.float)
+        k = apply_rotary_emb(k, freqs_cis).to(dtype=torch.float)
+        v = v.to(dtype=torch.float)
+
         # KV cache should always be enabled
         assert self.kv_cache is not None
         output = torch.ops.llama.sdpa_with_kv_cache(
-            q.float(),
-            k.float(),
-            v.float(),
+            q,
+            k,
+            v,
             self.kv_cache.k_cache,
             self.kv_cache.v_cache,
             input_pos[-1].item(),
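The casts are only moved upstream: q, k, and v are converted to float32 right after the rotary embedding instead of at the call site, so the custom SDPA op still receives fp32 tensors. A tiny check of the equivalence, with a random stand-in activation:

import torch

q_half = torch.randn(1, 8, 16, 64, dtype=torch.float16)  # stand-in activation

# .float() and .to(dtype=torch.float) perform the same upcast; the commit simply
# applies it to the rotary-embedding outputs rather than to the op arguments.
assert q_half.float().dtype == q_half.to(dtype=torch.float).dtype == torch.float32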

qops.py

Lines changed: 67 additions & 10 deletions
@@ -15,17 +15,20 @@
 from torch.nn.parameter import Parameter
 
 
-def linear_int8(input, weight, scales):
+def linear_int8_aoti(input, weight, scales):
     n_groups = scales.numel() // scales.shape[0]
 
     # we special-case channel-wise, because we know how to make that fast
     if n_groups == 1:
+        scales = scales.view(-1)
         if (
             torch.compiler.is_compiling()
             or input.device.type != "cpu"
             or torch.__version__ < "2.4"
         ):
-            return F.linear(input, weight.to(dtype=input.dtype)) * scales
+            lin = F.linear(input, weight.to(dtype=input.dtype))
+            # print(f"linear shape {lin.shape}, scales shape {scales.shape}")
+            return lin * scales
         # Use int8pack_mm for CPU eager
         return torch.ops.aten._weight_int8pack_mm(
             input.reshape(-1, input.shape[-1]),
@@ -42,6 +45,55 @@ def linear_int8(input, weight, scales):
     )
 
 
+def _qdq_dynamic_quantized_linear(
+    x_fp32, x_quant_min, x_quant_max, x_eps,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+):
+    x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams(x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8)
+    x_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        x_fp32, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8)
+    out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32)
+    return out_fp32
+
+def linear_int8_et(input, weight, scales):
+    n_groups = scales.numel() // scales.shape[0]
+
+    # we special-case channel-wise, because we know how to make that fast
+    if n_groups == 1:
+        scales = scales.view(-1)
+
+        if True:
+            lin = F.linear(input, weight.to(dtype=input.dtype))
+            # print(f"linear shape {lin.shape}, scales shape {scales.shape}")
+            return lin * scales
+
+        return _qdq_dynamic_quantized_linear(
+            x_fp32=input.float(),
+            x_quant_min=-128,
+            x_quant_max=127,
+            x_eps=torch.finfo(input.dtype).eps,
+            weight_i8=weight,
+            weight_scale=scales.float(),
+            weight_zero_point=0,
+            weight_quant_min=-128,
+            weight_quant_max=127,
+            bias_fp32=None,
+        ).to(dtype=input.dtype)
+
+    return F.linear(
+        input,
+        (
+            weight.to(dtype=input.dtype).view(weight.shape[0], n_groups, -1)
+            * scales.view(weight.shape[0], n_groups, -1)
+        ).view(weight.shape[0], -1),
+    )
+
+
 class LinearInt8(nn.Module):
     __constants__ = ["in_features", "out_features"]
     in_features: int
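linear_int8_et currently short-circuits to the plain scaled F.linear path (note the if True), with the quantize-dequantize route parked behind it. For readers who want the math the quantized_decomposed ops encode, here is a hedged plain-PyTorch sketch of dynamic per-tensor activation quantization followed by a dequantized matmul; it illustrates the pattern, not the decomposed ops themselves:

import torch
import torch.nn.functional as F

def qdq_linear_sketch(x_fp32: torch.Tensor,
                      weight_i8: torch.Tensor,
                      weight_scale: torch.Tensor) -> torch.Tensor:
    qmin, qmax = -128, 127
    # Dynamic activation quantization: derive scale/zero-point from the
    # observed range, quantize to int8, then immediately dequantize.
    x_min, x_max = x_fp32.min(), x_fp32.max()
    x_scale = ((x_max - x_min) / (qmax - qmin)).clamp(min=torch.finfo(x_fp32.dtype).eps)
    x_zp = torch.clamp(qmin - torch.round(x_min / x_scale), qmin, qmax)
    x_i8 = torch.clamp(torch.round(x_fp32 / x_scale) + x_zp, qmin, qmax)
    x_dq = (x_i8 - x_zp) * x_scale
    # Weight is already int8 with per-output-channel scales and zero-point 0.
    w_dq = weight_i8.to(torch.float32) * weight_scale.view(-1, 1)
    return F.linear(x_dq, w_dq)

x = torch.randn(2, 8)
w = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
print(qdq_linear_sketch(x, w, torch.full((4,), 0.05)).shape)  # torch.Size([2, 4])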
@@ -68,17 +120,14 @@ def __init__(
         if device is None:
             device = "cpu"
 
-        if device == "einputecutorch":
-            device = "cpu"
-
         assert not bias, "Bias is not supported by LinearInt8"
         self.in_features = in_features
         self.out_features = out_features
 
-        assert bool(weight) == bool(
-            scales
+        assert (weight is None) == bool(
+            scales is None
         ), "must specify both weights and scales, or neither"
-        if not weight:
+        if weight is None:
             weight = torch.empty(
                 (out_features, in_features), dtype=torch.int8, device=device
             )
@@ -91,8 +140,16 @@ def __init__(
         self.register_buffer("weight", weight.to(device))
         self.register_buffer("scales", scales.to(device))
 
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return linear_int8(input, self.weight, self.scales)
+        if use_et_backend():
+            self.forward = self.et_forward
+        else:
+            self.forward = self.aoti_forward
+
+    def aoti_forward(self, input: torch.Tensor) -> torch.Tensor:
+        return linear_int8_aoti(input, self.weight, self.scales)
+
+    def et_forward(self, input: torch.Tensor) -> torch.Tensor:
+        return linear_int8_et(input, self.weight, self.scales)
 
 
 class QuantizedEmbedding(torch.nn.Module):
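LinearInt8 now picks its forward implementation once, at construction time, based on use_et_backend(), instead of branching on every call. A minimal sketch of that dispatch pattern, with a plain boolean flag standing in for the backend query; the class and its math are illustrative only:

import torch
import torch.nn as nn
import torch.nn.functional as F

class DispatchingLinear(nn.Module):
    # Picks one of two forward implementations at construction, the way
    # LinearInt8 now selects et_forward or aoti_forward.
    def __init__(self, in_features, out_features, use_et_backend: bool = False):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.forward = self.et_forward if use_et_backend else self.aoti_forward

    def aoti_forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(x, self.weight)

    def et_forward(self, x: torch.Tensor) -> torch.Tensor:
        # Same math here; the real class routes this path to an ExecuTorch-friendly kernel.
        return x @ self.weight.t()

layer = DispatchingLinear(16, 32, use_et_backend=True)
print(layer(torch.randn(2, 16)).shape)  # torch.Size([2, 32])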

quantize.py

Lines changed: 36 additions & 69 deletions
@@ -93,7 +93,9 @@ def __init__(self, model: nn.Module, device="cpu", tokenizer=None, **kwargs):
         self.model_ = model
         self.device = device
         self.tokenizer = tokenizer
-        self.quantizer = quant_api.Int8DynActInt4WeightQuantizer(**kwargs)
+        self.quantizer = quant_api.Int8DynActInt4WeightQuantizer(
+            **kwargs, precision=get_precision(), scales_precision=get_precision()
+        )
 
     def create_quantized_state_dict(self) -> Dict:  # "StateDict"
         pass
@@ -362,39 +364,6 @@ def group_dequantize_tensor(w_int32, scales_and_zeros, n_bit=4, groupsize=128):
 ##### Weight-only int8 per-channel quantized code ######
 
 
-def replace_linear_weight_only_int8_per_channel(
-    module, device, node_type, groupsize=None
-):
-    if groupsize is not None and groupsize != 0:
-        pass
-
-    for name, child in module.named_children():
-        # print(f"name: {name}")
-        if isinstance(child, nn.Linear):
-            if (
-                (node_type == "*")
-                or (node_type == "output" and name == "output")
-                or (node_type == "!output" and name != "output")
-            ):
-                # print(f"{name, child}")
-                # print(f"in_features: {child.in_features}")
-                # print(f"out_features: {child.out_features}")
-                setattr(
-                    module,
-                    name,
-                    WeightOnlyInt8Linear(
-                        in_features=child.in_features,
-                        out_features=child.out_features,
-                        device=device,
-                        groupsize=groupsize,
-                    ),
-                )
-        else:
-            replace_linear_weight_only_int8_per_channel(
-                child, device, node_type, groupsize
-            )
-
-
 class WeightOnlyInt8QuantHandler(QuantHandler):
     def __init__(
         self,
@@ -416,9 +385,11 @@ def __init__(
         self.bitwidth = bitwidth
 
     @torch.no_grad()
-    def create_quantized_state_dict(self) -> Dict:
-        cur_state_dict = state_dict_device(self.model_.state_dict())
-        dict_device = "cpu"  # self.device
+    def quantize(self, module):
+        # cur_state_dict = state_dict_device(self.model_.state_dict())
+        # dict_device = "cpu"  # self.device
+
+        device = self.device
 
         if self.bitwidth == 4:
             range_min = -8
@@ -429,24 +400,19 @@ def create_quantized_state_dict(self) -> Dict:
         else:
             raise ValueError(f"Unsupported bitwidth {self.bitwidth}")
 
-        for fqn, mod in self.model_.named_modules():
-            # print(f"maybe? quantize {fqn}...{type(mod)}")
-            if isinstance(mod, torch.nn.Linear):
-                # print(f"candidate {fqn}, nodetype {self.node_type}")
+        for name, child in module.named_children():
+            # print(f"name: {name}")
+            if isinstance(child, nn.Linear):
                 if (
                     (self.node_type == "*")
-                    or (self.node_type == "output" and fqn in ["output", "final_proj"])
-                    or (
-                        self.node_type == "!output"
-                        and fqn not in ["output", "final_proj"]
-                    )
+                    or (self.node_type == "output" and name == "output")
+                    or (self.node_type == "!output" and name != "output")
                 ):
-                    # print(
-                    #     f"quantize {self.node_type} {fqn, mod} with groupsize {self.groupsize}, bitwidth {self.bitwidth}"
-                    # )
-
-                    # print(f"initial weight shape {mod.weight.shape}")
-                    input_weight = mod.weight.float()
+                    # print(f"{name, child}")
+                    input_weight = child.weight.float()
+                    # print(f"{name, child}")
+                    # print(f"in_features: {child.in_features}")
+                    # print(f"out_features: {child.out_features}")
 
                     # print(f"expanded weight shape {input_weight.shape}")
                     weight, scales, _ = dynamically_quantize_per_channel(
@@ -455,28 +421,29 @@ def create_quantized_state_dict(self) -> Dict:
                         range_max,
                         torch.int8,
                         self.groupsize,
-                        scales_dtype=mod.weight.dtype,
+                        scales_dtype=child.weight.dtype,
                     )
 
-                    weight = weight.to(device=dict_device)
-                    scales = scales.to(device=dict_device)
-                    cur_state_dict[f"{fqn}.weight"] = weight
-                    # squeeze makes groupsize=rowsize unidimensional
-                    cur_state_dict[f"{fqn}.scales"] = scales.squeeze(dim=-1)
-
-        return cur_state_dict
+                    setattr(
+                        module,
+                        name,
+                        WeightOnlyInt8Linear(
+                            in_features=child.in_features,
+                            out_features=child.out_features,
+                            device=self.device,
+                            # update variables from quantization
+                            weight=weight,
+                            scales=scales,
+                            groupsize=self.groupsize,
+                        ),
+                    )
+            else:
+                self.quantize(module)
 
-    def convert_for_runtime(self) -> nn.Module:
-        replace_linear_weight_only_int8_per_channel(
-            self.model_, self.device, self.node_type, self.groupsize
-        )
-        return self.model_
+        return module
 
     def quantized_model(self) -> nn.Module:
-        model_updated_state_dict = self.create_quantized_state_dict()
-        self.convert_for_runtime()
-        self.model_.load_state_dict(model_updated_state_dict)
-        return self.model_
+        return self.quantize(self.model_)
 
 
 #########################################################################
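The handler now quantizes and rewrites the model in a single recursive pass: each nn.Linear child is replaced by a WeightOnlyInt8Linear that already carries its quantized weight and scales, rather than first materializing a quantized state dict and loading it back afterwards, which is what the commit title refers to. A minimal sketch of that rewrite pattern, with a simplified stand-in for the replacement module and a naive symmetric per-channel quantizer; the sketch recurses into each non-Linear child so nested transformer blocks are reached:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Int8LinearSketch(nn.Module):
    # Simplified stand-in for WeightOnlyInt8Linear: int8 weight + per-channel scales.
    def __init__(self, weight_i8: torch.Tensor, scales: torch.Tensor):
        super().__init__()
        self.register_buffer("weight", weight_i8)
        self.register_buffer("scales", scales)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(x, self.weight.to(x.dtype)) * self.scales

def quantize_in_place(module: nn.Module) -> nn.Module:
    # Walk direct children: swap Linear layers, recurse into everything else.
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            w = child.weight.detach().float()
            scales = (w.abs().amax(dim=1) / 127.0).clamp(min=1e-8)  # per output channel
            w_i8 = torch.clamp(torch.round(w / scales.unsqueeze(1)), -128, 127).to(torch.int8)
            setattr(module, name, Int8LinearSketch(w_i8, scales))  # bias ignored for brevity
        else:
            quantize_in_place(child)
    return module

model = nn.Sequential(nn.Linear(16, 32, bias=False), nn.ReLU(), nn.Linear(32, 4, bias=False))
quantize_in_place(model)
print(model(torch.randn(2, 16)).shape)  # torch.Size([2, 4])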
