
Commit b0d0d0d

Add quantization documentation (#925)
Summary: Add details to `quantize.py` explaining the plan to change the quantization implementation.
1 parent d59c077 commit b0d0d0d

File tree

2 files changed: +20 -2 lines
generate.py

Lines changed: 4 additions & 2 deletions

@@ -132,7 +132,7 @@ class Generator:
         tokenizer_args: Defines the tokenizer configuration for both the model and speculative model
         generator_args: Controls the generation parameters
         profile: A Path to a directory where the profiling results will be stored, if enabled.
-        quantize: If True, quantize the model.
+        quantize: If True, quantize the model. Please refer to docs/quantization.md for details.
         draft_quantize: If True, quantize the draft model.
     """

@@ -715,6 +715,7 @@ def chat(
         print("Model: ", end="")

         buffer = []
+
         def callback(x, *, done_generating=False):
             return self._callback(
                 x,
@@ -726,6 +727,7 @@ def callback(x, *, done_generating=False):
         assert not generator_args.chat_mode

         buffer = [generator_args.prompt]
+
         def callback(x, *, done_generating=False):
             return self._callback(
                 x,
@@ -794,7 +796,7 @@ def callback(x, *, done_generating=False):
         )
         # Don't continue here.... because we need to report and reset
         # continue
-
+
         logging.info(
             f"\nTime for inference {i + 1}: {t:.02f} sec total, time to first token {metrics.get('time_to_first_token', 0.0):.02f} sec with {'sequential' if generator_args.sequential_prefill else 'parallel'} prefill, {num_tokens_generated} tokens, {tokens_sec:.02f} tokens/sec, {1000 / tokens_sec:.02f} ms/token"
         )
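
To illustrate the flags documented in the docstring above, here is a minimal, hypothetical sketch of a configuration object carrying the profile, quantize, and draft_quantize options. The dataclass and its field types are assumptions made for this example; the real Generator in generate.py defines its own argument handling, and docs/quantization.md describes the quantization options in detail.

# Hypothetical sketch, not the real generate.py API: field names mirror the
# docstring above; the actual Generator defines its own argument types.
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class GeneratorOptions:
    profile: Optional[Path] = None   # directory for profiling results, if enabled
    quantize: bool = False           # quantize the model; see docs/quantization.md
    draft_quantize: bool = False     # quantize the draft (speculative) model


# Enable quantization for both the main model and the draft model.
options = GeneratorOptions(quantize=True, draft_quantize=True)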

quantization/quantize.py

Lines changed: 16 additions & 0 deletions

@@ -4,6 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# Quantization API library for torchchat.
+#
+# NOTICE: most of the quant primitives code here will be deprecated in favor of torchao quantization APIs.
+#
+# Here are the quantization APIs available:
+# * quantize_model(): the entry point for all quantization with different options.
+# * QuantHandler: a base class for quantization handlers. This will be deprecated in favor of the torchao API.
+#
+# Different implementations of handlers:
+# * EmbeddingOnlyQuantHandler: quantizes embeddings.
+# * WeightOnlyInt8QuantHandler: int8 weight-only quantization. Will be migrated to the torchao API.
+# * WeightOnlyInt4QuantHandler: int4 weight-only quantization. Will be migrated to the torchao API.
+#
+# torchao Quantizer:
+# * Int8DynActInt4WeightQuantizer: dynamic quantization with int8 activations and int4 weights, using the torchao API.
+#
 from __future__ import annotations

 import json
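
To make the layering described in the new header comment concrete, below is a minimal, self-contained sketch of how a quantize_model() entry point could dispatch to QuantHandler subclasses. The class and function names mirror the comment above, but the dispatch table, the options format, and the toy int8 weight-only math are illustrative assumptions, not torchchat's actual implementation (which, per the notice, is slated to migrate to torchao APIs).

# Minimal, self-contained sketch of the handler pattern described above.
# Names mirror the comment (QuantHandler, WeightOnlyInt8QuantHandler,
# quantize_model); the per-tensor int8 fake-quantization below is a simplified
# stand-in, not torchchat's actual implementation.
from typing import Dict

import torch
import torch.nn as nn


class QuantHandler:
    """Base class for quantization handlers (slated for deprecation in favor of torchao)."""

    def __init__(self, model: nn.Module, **options):
        self.model = model
        self.options = options

    def quantized_model(self) -> nn.Module:
        raise NotImplementedError


class WeightOnlyInt8QuantHandler(QuantHandler):
    """Toy int8 weight-only handler: symmetric per-tensor quantize/dequantize of Linear weights."""

    def quantized_model(self) -> nn.Module:
        for module in self.model.modules():
            if isinstance(module, nn.Linear):
                w = module.weight.data
                scale = w.abs().max() / 127.0
                q = torch.clamp(torch.round(w / scale), -128, 127)
                module.weight.data = q * scale  # store dequantized values for illustration
        return self.model


# Hypothetical mapping from a user-facing option key to its handler class.
HANDLERS: Dict[str, type] = {"linear:int8": WeightOnlyInt8QuantHandler}


def quantize_model(model: nn.Module, quantize_options: Dict[str, dict]) -> nn.Module:
    # Apply each requested quantization scheme in turn.
    for key, options in quantize_options.items():
        model = HANDLERS[key](model, **options).quantized_model()
    return model


# Example usage on a tiny model.
model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
model = quantize_model(model, {"linear:int8": {}})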
