Commit ff6207f ("up")
Parent: 7c77a5b
6 files changed: +39 -39 lines changed


backends/xnnpack/operators/quant_params.py

Lines changed: 3 additions & 9 deletions

@@ -185,13 +185,11 @@ def from_q_dq_node(
         quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node)

         scale = quant_node_args[1]
-        zp = quant_node_args[2] if len(quant_node_args) > 2 else None
+        zp = quant_node_args[2]
         axis = 0
         if per_channel:
             assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str)
-            assert zp is None or (
-                isinstance(zp, torch.fx.Node) and isinstance(zp.target, str)
-            )
+            assert isinstance(zp, torch.fx.Node) and isinstance(zp.target, str)
             assert (
                 ep is not None
             ), "ExportedProgram must be provided to extract per channel params"
@@ -202,11 +200,7 @@ def _get_tensor(node):
                 return cast(torch.Tensor, param)

             scale = _get_tensor(scale)
-            zp = (
-                _get_tensor(zp)
-                if zp is not None
-                else torch.zeros_like(scale, dtype=torch.int8)
-            )
+            zp = _get_tensor(zp)
             axis = cast(int, quant_node_args[3])

             if _groupwise:
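In effect, the per-channel q/dq path now requires an explicit zero-point argument instead of synthesizing a zeros tensor. A minimal sketch of the resulting extraction logic, where `get_tensor` stands in for the file's `_get_tensor` helper (the helper name and wrapper function below are illustrative, not code from the repo):

```python
# Sketch of the per-channel path after this change: scale, zero point, and axis
# are all mandatory. `get_tensor` stands in for the file's _get_tensor helper.
import torch


def extract_per_channel_qparams(quant_node_args, get_tensor):
    scale_node = quant_node_args[1]
    zp_node = quant_node_args[2]  # previously optional, defaulting to a zeros tensor
    assert isinstance(scale_node, torch.fx.Node) and isinstance(scale_node.target, str)
    assert isinstance(zp_node, torch.fx.Node) and isinstance(zp_node.target, str)
    scale = get_tensor(scale_node)  # resolved from the ExportedProgram parameters
    zp = get_tensor(zp_node)
    axis = int(quant_node_args[3])
    return scale, zp, axis
```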

backends/xnnpack/utils/quant_utils.py

Lines changed: 9 additions & 15 deletions

@@ -58,24 +58,18 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool:
     node_input_args = extract_qdq_affine_op_args_for_decomposed_ops(node)

     scale = node_input_args[1]
-    if not isinstance(scale, torch.fx.Node):
+    zp = node_input_args[2]
+    if not (isinstance(scale, torch.fx.Node) and isinstance(zp, torch.fx.Node)):
         return False
-    if not (scale.target == operator.getitem):
-        return False
-    scale_choose_qparam = scale.all_input_nodes[0]
-    if not is_qparam(scale_choose_qparam):
+
+    if not (scale.target == operator.getitem and zp.target == operator.getitem):
         return False

-    if len(node_input_args) > 2:
-        zp = node_input_args[2]
-        if not isinstance(zp, torch.fx.Node):
-            return False
+    scale_choose_qparam = scale.all_input_nodes[0]
+    zp_choose_qparam = zp.all_input_nodes[0]

-        if not (zp.target == operator.getitem):
-            return False
-        zp_choose_qparam = zp.all_input_nodes[0]
-        if not is_qparam(zp_choose_qparam):
-            return False
+    if not (is_qparam(scale_choose_qparam) and is_qparam(zp_choose_qparam)):
+        return False

     return True

@@ -229,7 +223,7 @@ def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node):
     # add target_dtype_node after quant_min/quant_max
     args.append(target_dtype)
     # zero_point_domain
-    if len(node.args) > 7 and node.args[7] not in ["INT", "NONE"]:
+    if len(node.args) > 7 and node.args[7] != "INT":
         return None, None

     if is_per_channel_group(node):
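The streamlined check encodes one graph pattern: a dynamic q/dq node must read both its scale and its zero point via `operator.getitem` from a `choose_qparams` node. A hedged summary of that condition as a single predicate, with `is_qparam` standing in for the helper defined earlier in this file (the wrapper function name is illustrative):

```python
# Summary of the pattern the updated is_dynamic_qdq expects: both qparam inputs
# are getitem results over a choose_qparams node (`is_qparam` is the file's helper).
import operator

import torch


def qparams_come_from_choose_qparams(scale, zp, is_qparam) -> bool:
    if not (isinstance(scale, torch.fx.Node) and isinstance(zp, torch.fx.Node)):
        return False
    if not (scale.target == operator.getitem and zp.target == operator.getitem):
        return False
    return is_qparam(scale.all_input_nodes[0]) and is_qparam(zp.all_input_nodes[0])
```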

examples/models/llama/README.md

Lines changed: 1 addition & 1 deletion

@@ -416,7 +416,7 @@ python -m examples.models.llama.export_llama \
 ```

 A few notes:
-- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized with weight zeros or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and uses weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32, but is quantized with scales-only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
+- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
 - To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers.

 Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.
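To illustrate the reworded note: the third argument now selects torchao's mapping type (asymmetric vs. symmetric). A simplified sketch of how a spec like `torchao:4,32,false` maps onto torchao types, mirroring the `get_quant_embedding_transform` change later in this commit; the parsing helper itself is illustrative, not the exact code in the repo:

```python
# Illustrative mapping of the `-E "torchao:<bitwidth>,<group_size>[,<asymmetric>]"`
# spec onto torchao types; asymmetric == scales + zeros, symmetric == scales-only.
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import MappingType


def parse_embedding_spec(spec: str):
    bitwidth, group_size, *rest = spec.split(":")[1].split(",")
    is_asymmetric = rest[0].lower() == "true" if rest else True  # default: asymmetric
    group_size = int(group_size)
    granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size)
    mapping_type = MappingType.ASYMMETRIC if is_asymmetric else MappingType.SYMMETRIC
    return int(bitwidth), granularity, mapping_type


# Example: parse_embedding_spec("torchao:4,32,false") -> (4, PerGroup(32), SYMMETRIC)
```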

examples/models/llama/source_transformation/quantize.py

Lines changed: 23 additions & 12 deletions

@@ -112,9 +112,13 @@ def quantize( # noqa C901
         assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
         bitwidth = int(matches[0][0])

-        from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
-        from torchao.quantization.granularity import PerGroup, PerRow
-        from torchao.quantization.quant_api import quantize_
+        from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
+        from torchao.quantization.granularity import PerAxis, PerGroup
+        from torchao.quantization.quant_api import (
+            Int8DynamicActivationIntxWeightConfig,
+            MappingType,
+            quantize_,
+        )
         from torchao.utils import unwrap_tensor_subclass

         with torch.no_grad():
@@ -124,8 +128,11 @@ def quantize( # noqa C901
                 model,
                 Int8DynamicActivationIntxWeightConfig(
                     weight_dtype=getattr(torch, f"int{bitwidth}"),
-                    granularity=(PerRow() if group_size == 0 else PerGroup(group_size)),
-                    has_weight_zeros=False,
+                    granularity=(
+                        PerAxis(0) if group_size == 0 else PerGroup(group_size)
+                    ),
+                    mapping_type=MappingType.SYMMETRIC,
+                    layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
                 ),
             )
             model = unwrap_tensor_subclass(model)
@@ -777,38 +784,42 @@ def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
         EmbeddingQuantizer,
         SharedEmbeddingQuantizer,
     )
-    from torchao.quantization.granularity import PerGroup, PerRow
+    from torchao.quantization.granularity import PerAxis, PerGroup
+    from torchao.quantization.quant_api import MappingType

     quant_args = args.embedding_quantize.split(":")[1].split(",")
     if len(quant_args) == 2:
         bitwidth, group_size = quant_args
-        has_weight_zeros = True
+        is_asymmetric = True
     else:
-        bitwidth, group_size, has_weight_zeros = quant_args
+        bitwidth, group_size, is_asymmetric = quant_args

     if group_size in ["none", "None", "0"]:
         group_size = 0

     group_size = int(group_size)
     bitwidth = int(bitwidth)
-    has_weight_zeros = bool(has_weight_zeros)
+    is_asymmetric = bool(is_asymmetric)
     weight_dtype = getattr(torch, f"int{bitwidth}")
-    granularity = PerRow() if group_size == 0 else PerGroup(group_size)
+    granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size)
+    mapping_type = (
+        MappingType.ASYMMETRIC if is_asymmetric else MappingType.SYMMETRIC
+    )

     def _torchao_embedding_quantizer(model):
         with torch.no_grad():
             if not args.use_shared_embedding:
                 EmbeddingQuantizer(
                     weight_dtype=weight_dtype,
                     granularity=granularity,
-                    has_weight_zeros=has_weight_zeros,
+                    mapping_type=mapping_type,
                     use_fallback=False,
                 ).quantize(model)
             else:
                 SharedEmbeddingQuantizer(
                     weight_dtype=weight_dtype,
                     granularity=granularity,
-                    has_weight_zeros=has_weight_zeros,
+                    mapping_type=mapping_type,
                 ).quantize(model)
         return model
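For reference, a minimal usage sketch of the config-style torchao API this file migrates to. The toy `nn.Sequential` module is an assumption for illustration; the keyword arguments follow the hunks above:

```python
# Minimal sketch of the new config-style torchao API used above. The toy model
# is illustrative; kwargs mirror the diff for dynamically quantized linear layers.
import torch
import torch.nn as nn
from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    MappingType,
    quantize_,
)

model = nn.Sequential(nn.Linear(256, 256))
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,                      # e.g. 4-bit weights
        granularity=PerGroup(32),                     # or PerAxis(0) for channelwise
        mapping_type=MappingType.SYMMETRIC,           # scales-only weight quantization
        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
    ),
)
```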

examples/models/phi_4_mini/convert_weights.py

Lines changed: 2 additions & 1 deletion

@@ -110,7 +110,8 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:


 def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None:
-    # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves.
+    # If input_dir_or_checkpoint is a directory downloaded from HF, FullModelHFCheckpointer is used to extract the state dict.
+    # If input_dir_or_checkpoint is a checkpoint (from the eager model), it is loaded directly.
     if os.path.isdir(input_dir_or_checkpoint):
         checkpointer = FullModelHFCheckpointer(
             checkpoint_dir=input_dir_or_checkpoint,
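A hedged sketch of the branching the updated comment describes. The HF-directory branch delegates to torchtune's FullModelHFCheckpointer (as in the file) and is abstracted behind a callable here so the sketch stays self-contained; the `torch.load` arguments in the file-path branch are assumptions:

```python
# Hedged sketch of the two load paths in the new comment; function name, callable
# parameter, and torch.load kwargs are illustrative assumptions.
import os
from typing import Callable, Dict

import torch


def load_phi4_state_dict(
    input_dir_or_checkpoint: str,
    load_from_hf_dir: Callable[[str], Dict[str, torch.Tensor]],
) -> Dict[str, torch.Tensor]:
    if os.path.isdir(input_dir_or_checkpoint):
        # Directory downloaded from Hugging Face: let the checkpointer assemble the state dict.
        return load_from_hf_dir(input_dir_or_checkpoint)
    # Single checkpoint file saved from the eager model: load it directly.
    return torch.load(input_dir_or_checkpoint, map_location="cpu", weights_only=True)
```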

third-party/ao

Submodule ao updated 99 files
