@@ -212,11 +212,9 @@ def quantize(
212
212
if qmode == "int8" :
213
213
# Add quantization mode options here: group size, bit width, etc.
214
214
return WeightOnlyInt8QuantHandler (model ).quantized_model ()
215
- elif qmode == "int4" :
216
- model_int4 = Int8DynActInt4WeightQuantHandler (
217
- model ,
218
- precision = torch_dtype ,
219
- ).quantized_model ()
215
+ elif qmode == "8da4w" :
216
+ from torchao .quantization .quant_api import Int8DynActInt4WeightQuantizer
217
+ model_int4 = Int8DynActInt4WeightQuantizer (precision = torch_dtype ).quantize (model )
220
218
print ("quantized model:" , model_int4 )
221
219
return model_int4
222
220
elif qmode == "8da4w-gptq" :
@@ -287,7 +285,7 @@ def build_args_parser() -> argparse.ArgumentParser:
287
285
"--quantization_mode" ,
288
286
type = str ,
289
287
default = None ,
290
- choices = ["int8" , "int4 " , "8da4w-gptq" ],
288
+ choices = ["int8" , "8da4w " , "8da4w-gptq" ],
291
289
help = "type of quantization" ,
292
290
)
293
291
@@ -430,7 +428,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
430
428
if args .dtype_override is not None :
431
429
dtype_override = DType [args .dtype_override ]
432
430
else :
433
- dtype_override = DType ["fp16" ] if args .quantization_mode == "int4" else None
431
+ dtype_override = DType ["fp16" ] if args .quantization_mode in [ "8da4w" , "8da4w-gptq" ] else None
434
432
435
433
# source transforms
436
434
transforms = []
@@ -500,7 +498,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
500
498
if args .xnnpack :
501
499
# Following changes due to.
502
500
# 1. We need dynamically quantized partitioner for both pt2e_quantize options
503
- # as well as "qmode int4 " which is also dynamic quantizes linear layers.
501
+ # as well as "qmode 8da4w", which also dynamically quantizes linear layers.
504
502
# 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops.
505
503
partitioners [XnnpackDynamicallyQuantizedPartitioner .__name__ ] = (
506
504
XnnpackDynamicallyQuantizedPartitioner ()
0 commit comments