
Commit b0d0d0d

Add quantization documentation (#925)
Summary: Add details to `quantize.py` explaining the plan to change the quantization implementation.
1 parent d59c077 commit b0d0d0d

File tree

2 files changed: +20 -2 lines
generate.py

Lines changed: 4 additions & 2 deletions

@@ -132,7 +132,7 @@ class Generator:
         tokenizer_args: Defines the tokenizer configuration for both the model and speculative model
         generator_args: Controls the generation parameters
         profile: A Path to a directory where the profiling results will be stored, if enabled.
-        quantize: If True, quantize the model.
+        quantize: If True, quantize the model. Please refer to docs/quantization.md for details.
         draft_quantize: If True, quantize the draft model.
     """

@@ -715,6 +715,7 @@ def chat(
         print("Model: ", end="")

         buffer = []
+
         def callback(x, *, done_generating=False):
             return self._callback(
                 x,
@@ -726,6 +727,7 @@ def callback(x, *, done_generating=False):
         assert not generator_args.chat_mode

         buffer = [generator_args.prompt]
+
         def callback(x, *, done_generating=False):
             return self._callback(
                 x,
@@ -794,7 +796,7 @@ def callback(x, *, done_generating=False):
         )
         # Don't continue here.... because we need to report and reset
         # continue
-
+
         logging.info(
             f"\nTime for inference {i + 1}: {t:.02f} sec total, time to first token {metrics.get('time_to_first_token', 0.0):.02f} sec with {'sequential' if generator_args.sequential_prefill else 'parallel'} prefill, {num_tokens_generated} tokens, {tokens_sec:.02f} tokens/sec, {1000 / tokens_sec:.02f} ms/token"
         )
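
To illustrate the flags documented in the docstring above, here is a minimal, hypothetical sketch of a configuration object carrying the profile, quantize, and draft_quantize options. The dataclass and its field types are assumptions made for this example; the real Generator in generate.py defines its own argument handling, and docs/quantization.md describes the quantization options in detail.

# Hypothetical sketch, not the real generate.py API: field names mirror the
# docstring above; the actual Generator defines its own argument types.
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class GeneratorOptions:
    profile: Optional[Path] = None   # directory for profiling results, if enabled
    quantize: bool = False           # quantize the model; see docs/quantization.md
    draft_quantize: bool = False     # quantize the draft (speculative) model


# Enable quantization for both the main model and the draft model.
options = GeneratorOptions(quantize=True, draft_quantize=True)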

quantization/quantize.py

Lines changed: 16 additions & 0 deletions

@@ -4,6 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# Quantization API library for torchchat.
+#
+# NOTICE: most of the quant primitives code here will be deprecated in favor of torchao quantization APIs.
+#
+# Here are the quantization APIs available:
+# * quantize_model(): the entry point for all quantization with different options.
+# * QuantHandler: a base class for quantization handlers. This will be deprecated in favor of the torchao API.
+#
+# Different implementations of handlers:
+# * EmbeddingOnlyQuantHandler: quantizes embeddings.
+# * WeightOnlyInt8QuantHandler: int8 weight-only quantization. Will be migrated to the torchao API.
+# * WeightOnlyInt4QuantHandler: int4 weight-only quantization. Will be migrated to the torchao API.
+#
+# torchao Quantizer:
+# * Int8DynActInt4WeightQuantizer: dynamic quantization with int8 activations and int4 weights, using the torchao API.
+#
 from __future__ import annotations

 import json
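
To make the layering described in the new header comment concrete, below is a minimal, self-contained sketch of how a quantize_model() entry point could dispatch to QuantHandler subclasses. The class and function names mirror the comment above, but the dispatch table, the options format, and the toy int8 weight-only math are illustrative assumptions, not torchchat's actual implementation (which, per the notice, is slated to migrate to torchao APIs).

# Minimal, self-contained sketch of the handler pattern described above.
# Names mirror the comment (QuantHandler, WeightOnlyInt8QuantHandler,
# quantize_model); the per-tensor int8 fake-quantization below is a simplified
# stand-in, not torchchat's actual implementation.
from typing import Dict

import torch
import torch.nn as nn


class QuantHandler:
    """Base class for quantization handlers (slated for deprecation in favor of torchao)."""

    def __init__(self, model: nn.Module, **options):
        self.model = model
        self.options = options

    def quantized_model(self) -> nn.Module:
        raise NotImplementedError


class WeightOnlyInt8QuantHandler(QuantHandler):
    """Toy int8 weight-only handler: symmetric per-tensor quantize/dequantize of Linear weights."""

    def quantized_model(self) -> nn.Module:
        for module in self.model.modules():
            if isinstance(module, nn.Linear):
                w = module.weight.data
                scale = w.abs().max() / 127.0
                q = torch.clamp(torch.round(w / scale), -128, 127)
                module.weight.data = q * scale  # store dequantized values for illustration
        return self.model


# Hypothetical mapping from a user-facing option key to its handler class.
HANDLERS: Dict[str, type] = {"linear:int8": WeightOnlyInt8QuantHandler}


def quantize_model(model: nn.Module, quantize_options: Dict[str, dict]) -> nn.Module:
    # Apply each requested quantization scheme in turn.
    for key, options in quantize_options.items():
        model = HANDLERS[key](model, **options).quantized_model()
    return model


# Example usage on a tiny model.
model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
model = quantize_model(model, {"linear:int8": {}})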
