 11 |  11 | import logging
 12 |  12 | import os
 13 |  13 | import shlex
 14 |     | -from dataclasses import dataclass
 15 |  14 |
 16 |  15 | from functools import partial
 17 |  16 | from pathlib import Path
 18 |     | -from typing import Any, List, Optional, Union
    |  17 | +from typing import Any, Optional, Union
 19 |  18 |
 20 |  19 | import pkg_resources
 21 |  20 | import torch

 30 |  29 | from executorch.sdk.etrecord import generate_etrecord
 31 |  30 | from executorch.util.activation_memory_profiler import generate_memory_trace
 32 |  31 | from sentencepiece import SentencePieceProcessor
 33 |     | -from torch.ao.quantization.quantizer import Quantizer
 34 |     | -from torch.ao.quantization.quantizer.embedding_quantizer import EmbeddingQuantizer
 35 |     | -from torch.ao.quantization.quantizer.xnnpack_quantizer import (
 36 |     | -    get_symmetric_quantization_config,
 37 |     | -    XNNPACKQuantizer,
 38 |     | -)
 39 |  32 |
 40 |  33 | from .builder import DType, LlamaEdgeManager, load_llama_model, WeightType
    |  34 | +from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers
 41 |  35 |
 42 |  36 | from .quantize import EmbeddingOnlyInt8QuantHandler, WeightOnlyInt8QuantHandler
 43 |  37 |

@@ -68,121 +62,6 @@ def verbose_export():

 68 |  62 |     return verbosity_setting
 69 |  63 |
 70 |  64 |
 71 |     | -@dataclass
 72 |     | -class EmbeddingQuantOptions:
 73 |     | -    is_per_channel: bool = True
 74 |     | -    group_size: int = -1
 75 |     | -
 76 |     | -    def __post_init__(self):
 77 |     | -        if self.group_size != -1:
 78 |     | -            raise RuntimeError(
 79 |     | -                "PT2E embedding quantizer does not support groupwise at the moment."
 80 |     | -            )
 81 |     | -
 82 |     | -
 83 |     | -@dataclass
 84 |     | -class DynamicQuantLinearOptions:
 85 |     | -    is_per_channel: bool = True
 86 |     | -    is_qc4: bool = False
 87 |     | -
 88 |     | -
 89 |     | -@dataclass
 90 |     | -class PT2EQuantOptions:
 91 |     | -    quantize_embedding: Optional[EmbeddingQuantOptions] = None
 92 |     | -    quantize_linear: Optional[DynamicQuantLinearOptions] = None
 93 |     | -
 94 |     | -
 95 |     | -def _get_pt2e_quantization_params(args) -> Optional[PT2EQuantOptions]:
 96 |     | -    if args.pt2e_quantize is None:
 97 |     | -        return None
 98 |     | -    if args.quantization_mode:
 99 |     | -        raise ValueError("Cannot specify both --quantization_mode and --pt2e_quantize")
100 |     | -
101 |     | -    quantization_options = args.pt2e_quantize.split(",")
102 |     | -    quantization_options = [option.strip() for option in quantization_options]
103 |     | -    # This can really be improved significantly.
104 |     | -    # Hopefully we dont release this in its current form.
105 |     | -    # Just using this for quick experiments.
106 |     | -    quant_options = None
107 |     | -    if "embedding" in quantization_options:
108 |     | -        quant_options = quant_options or PT2EQuantOptions()
109 |     | -        quant_options.quantize_embedding = EmbeddingQuantOptions()
110 |     | -    if (
111 |     | -        "xnnpack_dynamic" in quantization_options
112 |     | -        and "xnnpack_dynamic_qc4" in quantization_options
113 |     | -    ):
114 |     | -        raise RuntimeError(
115 |     | -            "For dynamic linear quantization via xnnpack quantizer you can chose only qc8 or qc4 option, not both."
116 |     | -        )
117 |     | -    if (
118 |     | -        "xnnpack_dynamic" in quantization_options
119 |     | -        or "xnnpack_dynamic_qc4" in quantization_options
120 |     | -    ):
121 |     | -        quant_options = quant_options or PT2EQuantOptions()
122 |     | -        quant_options.quantize_linear = DynamicQuantLinearOptions()
123 |     | -        if "xnnpack_dynamic_qc4" in quantization_options:
124 |     | -            quant_options.quantize_linear.is_qc4 = True
125 |     | -
126 |     | -    return quant_options
127 |     | -
128 |     | -
129 |     | -# TODO: move args is used only get so_file. Refactor this
130 |     | -def get_pt2e_quantizers(
131 |     | -    quant_params: Optional[PT2EQuantOptions], args
132 |     | -) -> List[Quantizer]:
133 |     | -    """
134 |     | -    Get a list of quantizers from quantization params
135 |     | -    Args:
136 |     | -        args: quant params
137 |     | -    Returns:
138 |     | -        A list of quantizers to pass into LlamaBuilder.
139 |     | -    """
140 |     | -
141 |     | -    def check_embedding_byte_registered():
142 |     | -        try:
143 |     | -            _ = torch.ops.quantized_decomposed.embedding_byte.out
144 |     | -        except AttributeError:
145 |     | -            if args.so_library:
146 |     | -                print(f"Loading library {args.so_library}")
147 |     | -                torch.ops.load_library(args.so_library)
148 |     | -            else:
149 |     | -                raise RuntimeError(
150 |     | -                    "Need to specify shared library path to register quantized ops (and their out variants) into EXIR.\n"
151 |     | -                    "Follow the following steps to build the needed lib via cmake.\n"
152 |     | -                    'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
153 |     | -                    "Set that as TORCH_PACKAGE_DIR.\n"
154 |     | -                    "Then from root executorch dir do the following:\n"
155 |     | -                    "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
156 |     | -                    'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
157 |     | -                    "Then specify the said library via -s <path to libquantized_ops_aot_lib.so\n"
158 |     | -                )
159 |     | -
160 |     | -    quantizers = []
161 |     | -    if quant_params is not None and quant_params.quantize_embedding is not None:
162 |     | -        logging.info("Apply PT2E embedding quantization.")
163 |     | -        check_embedding_byte_registered()
164 |     | -        quantizers.append(EmbeddingQuantizer())
165 |     | -    if quant_params is not None and quant_params.quantize_linear is not None:
166 |     | -        logging.info("Apply PT2E dynamic linear quantization.")
167 |     | -        dynamic_quantizer = XNNPACKQuantizer()
168 |     | -        assert quant_params.quantize_linear is not None
169 |     | -        if not quant_params.quantize_linear.is_per_channel:
170 |     | -            raise ValueError(
171 |     | -                "At the moment only per channel weight quantization is supported."
172 |     | -            )
173 |     | -        if quant_params.quantize_linear.is_qc4:
174 |     | -            operator_config_dynamic = get_symmetric_quantization_config(
175 |     | -                is_per_channel=True, is_dynamic=True, weight_qmin=-8, weight_qmax=7
176 |     | -            )
177 |     | -        else:
178 |     | -            operator_config_dynamic = get_symmetric_quantization_config(
179 |     | -                is_per_channel=True, is_dynamic=True
180 |     | -            )
181 |     | -        dynamic_quantizer.set_global(operator_config_dynamic)
182 |     | -        quantizers.append(dynamic_quantizer)
183 |     | -    return quantizers
184 |     | -
185 |     | -
186 |  65 | def materialze_broadcast_of_rope_freq_cis(
187 |  66 |     module: torch.nn.Module,
188 |  67 | ):
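For context beyond the diff itself: below is a minimal sketch of how the relocated helpers would be consumed from this module after the move, assuming `.quant_lib` keeps the signatures of the removed definitions shown above. The `args` object here is a hypothetical stand-in for the parsed CLI arguments, not code from this change.

# Sketch only: assumes .quant_lib exposes the same signatures as the code removed above.
from argparse import Namespace

from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers

# Hypothetical stand-in for the parsed CLI args; real values come from the arg parser.
args = Namespace(
    pt2e_quantize="embedding,xnnpack_dynamic",  # comma-separated options parsed by the helper
    quantization_mode=None,  # must be unset when pt2e_quantize is given
    so_library=None,  # path to libquantized_ops_aot_lib if embedding_byte is not registered
)

quant_params = _get_pt2e_quantization_params(args)    # -> Optional[PT2EQuantOptions]
quantizers = get_pt2e_quantizers(quant_params, args)  # -> List[Quantizer] to pass to the builder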