
Commit 8a569cb

add 16a4w_hqq quant mode
Pull Request resolved: #3752

Prerequisite: install hqq following https://github.com/mobiusml/hqq

Step 1: use hqq to quantize the weights to 4 bits.
Step 2: use static quantization to quantize the activations to 16 bits.

Graph-mode calibration is currently too slow, so the quant observers are added to the eager model for faster iteration.

Command:
```
python -m examples.models.llama2.eval_llama -t /data/users/chenlai/models/llama2/tokenizer.model -p /data/users/chenlai/models/llama2/params.json -c /data/users/chenlai/models/llama2/consolidated.00.pth --max_seq_len 129 -qmode 16a4w-hqq --limit 5 2>&1 | tee hqq_16a4w.log
```

Differential Revision: [D57849772](https://our.internmc.facebook.com/intern/diff/D57849772/)

ghstack-source-id: 227950317
1 parent 79e9b79 commit 8a569cb
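The two steps above map directly onto the helpers this commit adds to examples/models/llama2/source_transformation/quantize.py (last diff below). As orientation, here is a minimal sketch of the same flow on a single toy layer, reusing the `BaseQuantizeConfig` and `FakeQuantize` settings from that diff; the layer shape, the random calibration tensor, and the standalone usage are illustrative assumptions, not part of the commit:

```python
import torch
import torch.nn as nn
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear  # requires hqq (see prerequisite above)

linear = nn.Linear(64, 64)  # toy layer; sizes are arbitrary

# Step 1: 4-bit weight quantization with hqq (same config as the diff; hqq's default nbits is 4).
quant_config = BaseQuantizeConfig(
    quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False
)
q_linear = HQQLinear(
    linear, quant_config, compute_dtype=torch.float32, del_orig=True, device="cpu"
)

# Step 2: observe activations in eager mode with a 16-bit fake-quant module,
# the same construction LinearActFakeQuant in the diff uses for input and output.
act_fake_quant = torch.quantization.FakeQuantize(
    observer=torch.quantization.MovingAverageMinMaxObserver,
    dtype=torch.int32,
    quant_min=torch.iinfo(torch.uint16).min,
    quant_max=torch.iinfo(torch.uint16).max,
)

# Calibration: run representative activations through the observer. In the commit this
# happens by running the wikitext eval over the whole wrapped model (run_wikitext_eval).
_ = act_fake_quant(torch.randn(2, 64))
scale, zero_point = act_fake_quant.calculate_qparams()
```

In the commit itself, `prepare()` wraps every `HQQLinear` in a `LinearActFakeQuant`, the wikitext eval provides the calibration data, and `convert()` then replaces each wrapper with a `LinearActQuant` that applies the frozen quantization parameters.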

File tree: 3 files changed (+300 −4 lines)

examples/models/llama2/builder.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
 from torch.nn.attention import SDPBackend
 
-from ...portable.utils import export_to_edge, save_pte_program
+from examples.portable.utils import export_to_edge, save_pte_program
 from ..model_factory import EagerModelFactory
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
```

examples/models/llama2/export_llama_lib.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -119,7 +119,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--quantization_mode",
         type=str,
         default=None,
-        choices=["int8", "8da4w", "8da4w-gptq"],
+        choices=["int8", "8da4w", "8da4w-gptq", "16a4w-hqq"],
         help="type of quantization",
     )
 
@@ -366,8 +366,8 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
         )
         .set_output_dir(output_dir_path)
         .set_metadata(args.metadata)
-        .source_transform(transforms)
         .to_dtype(dtype_override)
+        .source_transform(transforms)
     )
 
 
```
examples/models/llama2/source_transformation/quantize.py

Lines changed: 297 additions & 1 deletion
```diff
@@ -6,11 +6,21 @@
 
 from functools import partial
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
+from executorch.examples.models.llama2.tokenizer.tokenizer import (
+    Tokenizer,
+    Tokenizer as SentencePieceTokenizer,
+)
+from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
+from lm_eval.api.model import LM
+from lm_eval.evaluator import evaluate
+from lm_eval.models.huggingface import HFLM as eval_wrapper
+from lm_eval.tasks import get_task_dict
 
 from sentencepiece import SentencePieceProcessor
 
@@ -33,6 +43,233 @@
 fsLinear = nn.Linear
 
 
+class EagerEvalWrapper(eval_wrapper):
+    """
+    A wrapper class based on GPTFast, providing integration with the lm-evaluation-harness library.
+    """
+
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
+        max_seq_length: Optional[int] = None,
+        use_kv_cache: bool = False,
+    ):
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        super().__init__(device=device)
+        self._model = model
+        self._tokenizer = tokenizer
+        self._device = torch.device(device)
+        self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
+        self._use_kv_cache = use_kv_cache
+
+    @property
+    def eot_token_id(self):
+        return self._tokenizer.eos_id
+
+    @property
+    def max_length(self):
+        return self._max_seq_length
+
+    @property
+    def max_gen_toks(self):
+        return 50
+
+    @property
+    def batch_size(self):
+        return 1
+
+    @property
+    def device(self):
+        return self._device
+
+    def tok_encode(self, string: str, **kwargs):
+        tokens = self._tokenizer.encode(string, bos=True, eos=False)
+        encoded = torch.tensor(tokens, dtype=torch.int, device=self.device)
+        # encoded is a pytorch tensor, but some internal logic in the
+        # eval harness expects it to be a list instead
+        # TODO: verify this for multi-batch as well
+        encoded = encoded.tolist()
+        return encoded
+
+    def tok_decode(self, tokens):
+        decoded = self._tokenizer.decode(tokens)
+        return decoded
+
+    def _model_call(self, inps):
+        bsz, seq_len = inps.shape
+        if self._use_kv_cache:
+            pos_tensor = torch.arange(
+                self._max_seq_length, dtype=torch.int64, device=self.device
+            )
+
+            logits = self._model(inps[:, : self._max_seq_length], pos_tensor)
+            return logits
+        else:
+            logits = self._model(inps)
+            return logits
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
+
+
+@torch.no_grad()
+def eval(
+    eval_wrapper: LM,
+    tasks: Optional[list] = None,
+    limit: Optional[int] = None,
+) -> dict:
+    """
+    Evaluates a language model on a specified task using the lm-evaluation-harness library.
+    Args:
+        eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
+        task (str): The name of the evaluation task to perform.
+        limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
+    Returns:
+        eval_results (dict): A dictionary of evaluation results for the specified task(s).
+    """
+    if tasks is None:
+        tasks = ["wikitext"]
+    if "hendrycks_test" in tasks:
+        tasks.remove("hendrycks_test")
+        tasks += list(lm_eval.tasks.hendrycks_test.create_all_tasks().keys())
+    task_dict = get_task_dict(tasks)
+    eval_results = evaluate(
+        eval_wrapper,
+        task_dict,
+        limit=limit,
+    )
+    return eval_results
+
+
+def run_wikitext_eval(m, tokenizer_path, seq_len):
+    print("run_wikitext_eval calibration...")
+    print("tokenizer_path: ", tokenizer_path)
+    tokenizer = Tokenizer(str(tokenizer_path))
+    eval_wrapper = EagerEvalWrapper(
+        model=m,
+        tokenizer=tokenizer,
+        max_seq_length=seq_len,
+        use_kv_cache=False,
+    )
+    eval_results = eval(
+        eval_wrapper,
+        tasks=["wikitext"],
+        # limit=128,
+        limit=5,
+        # limit=1,
+    )
+    for task, res in eval_results["results"].items():
+        print(f"{task}: {res}")
+
+
+class LinearActFakeQuant(torch.nn.Module):
+    def __init__(self, linear):
+        super().__init__()
+        self.linear = linear
+        self.input_activation_fake_quant = torch.quantization.FakeQuantize(
+            observer=torch.quantization.MovingAverageMinMaxObserver,
+            dtype=torch.int32,
+            quant_min=torch.iinfo(torch.uint16).min,
+            quant_max=torch.iinfo(torch.uint16).max,
+        )
+        self.output_activation_fake_quant = torch.quantization.FakeQuantize(
+            observer=torch.quantization.MovingAverageMinMaxObserver,
+            dtype=torch.int32,
+            quant_min=torch.iinfo(torch.uint16).min,
+            quant_max=torch.iinfo(torch.uint16).max,
+        )
+
+    def forward(self, x):
+        x = self.input_activation_fake_quant(x)
+        return self.output_activation_fake_quant(self.linear(x))
+
+
+def get_quant_params(activation_fake_quant):
+    quant_min = activation_fake_quant.quant_min
+    quant_max = activation_fake_quant.quant_max
+    qparams = activation_fake_quant.calculate_qparams()
+    scale = qparams[0]
+    zero_point = qparams[1]
+    return (quant_min, quant_max, scale, zero_point)
+
+
+class LinearActQuant(torch.nn.Module):
+
+    def __init__(self, linear_fake_quant):
+        super().__init__()
+        self.linear_fake_quant = linear_fake_quant
+        self.input_quant_min, self.input_quant_max, input_scale, input_zero_point = (
+            get_quant_params(linear_fake_quant.input_activation_fake_quant)
+        )
+        self.input_scale = input_scale.to(device="cuda")
+        self.input_zero_point = input_zero_point.to(device="cuda")
+
+        (
+            self.output_quant_min,
+            self.output_quant_max,
+            output_scale,
+            output_zero_point,
+        ) = get_quant_params(linear_fake_quant.output_activation_fake_quant)
+        self.output_scale = output_scale.to(device="cuda")
+        self.output_zero_point = output_zero_point.to(device="cuda")
+
+    def forward(self, x):
+        # Manually quantize the input tensor using observed min and max values
+        q_tensor = torch.round(x / self.input_scale + self.input_zero_point)
+        # Clip to ensure within the range [0, 255]
+        q_tensor = torch.clamp(q_tensor, self.input_quant_min, self.input_quant_max)
+        # Dequantize to the original scale
+        dequantized_tensor = (q_tensor - self.input_zero_point) * self.input_scale
+
+        linear_output = self.linear_fake_quant.linear(dequantized_tensor)
+
+        # # Quantize the linear output tensor
+        q_linear_output = torch.round(
+            linear_output / self.output_scale + self.output_zero_point
+        )
+        q_linear_output = torch.clamp(
+            q_linear_output, self.output_quant_min, self.output_quant_max
+        )
+        # Dequantize the linear output tensor
+        dq_linear_output = (
+            q_linear_output - self.output_zero_point
+        ) * self.output_scale
+
+        return dq_linear_output
+
+
+def _replace_linear_q_act(module: torch.nn.Module, stage: str):
+    for name, child in module.named_children():
+        if stage == "convert":
+            if isinstance(child, LinearActFakeQuant):
+                new_linear = LinearActQuant(child)
+                setattr(module, name, new_linear)
+            else:
+                _replace_linear_q_act(child, stage)
+        elif stage == "prepare":
+            if isinstance(child, HQQLinear):
+                new_linear = LinearActFakeQuant(child)
+                setattr(module, name, new_linear)
+            else:
+                _replace_linear_q_act(child, stage)
+
+
+def replace_linear_q_act(module: torch.nn.Module, stage: str):
+    _replace_linear_q_act(
+        module,
+        stage,
+    )
+
+
+def prepare(model):
+    replace_linear_q_act(model, "prepare")
+
+
+def convert(model):
+    replace_linear_q_act(model, "convert")
+
+
 def quantize(
     model: torch.nn.Module,
     qmode: str,
@@ -127,6 +364,65 @@ def quantize(
             group_size,
         )
         model = gptq_quantizer.quantize(model, inputs)
+        return model
+    elif qmode == "16a4w-hqq":
+        print("running 16a4w-hqq")
+        from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
+
+        def _replace_linear_16a4w_hqq(
+            module: torch.nn.Module,
+            quant_config,
+            compute_dtype,
+            del_orig=False,
+        ):
+            for name, child in module.named_children():
+                if isinstance(child, nn.Linear):
+                    new_linear = HQQLinear(
+                        child,
+                        quant_config,
+                        compute_dtype=compute_dtype,
+                        del_orig=True,
+                        device="cpu",
+                    )
+                    setattr(module, name, new_linear)
+                else:
+                    _replace_linear_16a4w_hqq(
+                        child,
+                        quant_config,
+                        compute_dtype,
+                        del_orig=False,
+                    )
+
+        def replace_linear_16a4w_hqq(
+            module: torch.nn.Module,
+            quant_config,
+            compute_dtype,
+            del_orig=False,
+        ):
+            _replace_linear_16a4w_hqq(
+                module,
+                quant_config,
+                compute_dtype,
+                del_orig=False,
+            )
+
+        compute_dtype = torch.float32  # torch.bfloat16 #[torch.float16, torch.bfloat16]
+        quant_config = BaseQuantizeConfig(
+            quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False
+        )
+        print("before replace_linear_16a4w_hqq model: ", model)
+        replace_linear_16a4w_hqq(model, quant_config, compute_dtype)
+        print("after replace_linear_16a4w_hqq model: ", model)
+
+        print("model before prepare: ", model)
+        prepare(model)
+        print("model after prepare: ", model)
+
+        # Calibration with wikitext, currently only use 5 samples and can be fine tuned
+        run_wikitext_eval(model, tokenizer_path, 128)
+        print("model after calibrate: ", model)
+        convert(model)
+
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
```
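After calibration, `convert()` swaps each `LinearActFakeQuant` for a `LinearActQuant`, which freezes the observed scale and zero-point and simulates 16-bit activation quantization numerically in its `forward`. A small worked example of that quantize-clamp-dequantize round trip, with made-up scale/zero-point values standing in for calibration results:

```python
import torch

# Made-up qparams standing in for what the calibration observers would report.
scale, zero_point = 0.002, 32768.0
quant_min = torch.iinfo(torch.uint16).min  # 0
quant_max = torch.iinfo(torch.uint16).max  # 65535

x = torch.tensor([-0.05, 0.0, 0.0421])

# Same arithmetic as LinearActQuant.forward, applied to one activation tensor:
q = torch.clamp(torch.round(x / scale + zero_point), quant_min, quant_max)
dq = (q - zero_point) * scale

print(q)   # tensor([32743., 32768., 32789.])
print(dq)  # ~tensor([-0.0500, 0.0000, 0.0420]); the 0.0421 -> 0.0420 gap is the 16-bit rounding error
```

The difference between `x` and `dq` is the activation quantization error the exported model will see; the 4-bit weights are handled separately by hqq in step 1.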
