Commit a540bfe ("init")
1 parent a664d7b commit a540bfe

6 files changed, +116 -30 lines changed

backends/xnnpack/operators/quant_params.py

Lines changed: 9 additions & 3 deletions

@@ -185,11 +185,13 @@ def from_q_dq_node(
         quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node)

         scale = quant_node_args[1]
-        zp = quant_node_args[2]
+        zp = quant_node_args[2] if len(quant_node_args) > 2 else None
         axis = 0
         if per_channel:
             assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str)
-            assert isinstance(zp, torch.fx.Node) and isinstance(zp.target, str)
+            assert zp is None or (
+                isinstance(zp, torch.fx.Node) and isinstance(zp.target, str)
+            )
             assert (
                 ep is not None
             ), "ExportedProgram must be provided to extract per channel params"
@@ -200,7 +202,11 @@ def _get_tensor(node):
                 return cast(torch.Tensor, param)

             scale = _get_tensor(scale)
-            zp = _get_tensor(zp)
+            zp = (
+                _get_tensor(zp)
+                if zp is not None
+                else torch.zeros_like(scale, dtype=torch.int8)
+            )
             axis = cast(int, quant_node_args[3])

             if _groupwise:
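For context, a minimal standalone sketch (illustration only, not part of this commit) of the fallback added above: when a quantize/dequantize node carries no zero_point argument, an all-zero int8 tensor shaped like the per-channel scales is substituted.

import torch

scale = torch.tensor([0.1, 0.2, 0.4])  # per-channel scales extracted from the graph
zp = None  # zero_point omitted by the quantizer
zp = zp if zp is not None else torch.zeros_like(scale, dtype=torch.int8)
print(zp)  # tensor([0, 0, 0], dtype=torch.int8)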

backends/xnnpack/utils/quant_utils.py

Lines changed: 15 additions & 9 deletions

@@ -58,19 +58,25 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool:
     node_input_args = extract_qdq_affine_op_args_for_decomposed_ops(node)

     scale = node_input_args[1]
-    zp = node_input_args[2]
-    if not (isinstance(scale, torch.fx.Node) and isinstance(zp, torch.fx.Node)):
+    if not isinstance(scale, torch.fx.Node):
         return False
-
-    if not (scale.target == operator.getitem and zp.target == operator.getitem):
+    if not (scale.target == operator.getitem):
         return False
-
     scale_choose_qparam = scale.all_input_nodes[0]
-    zp_choose_qparam = zp.all_input_nodes[0]
-
-    if not (is_qparam(scale_choose_qparam) and is_qparam(zp_choose_qparam)):
+    if not is_qparam(scale_choose_qparam):
         return False

+    if len(node_input_args) > 2:
+        zp = node_input_args[2]
+        if not isinstance(zp, torch.fx.Node):
+            return False
+
+        if not (zp.target == operator.getitem):
+            return False
+        zp_choose_qparam = zp.all_input_nodes[0]
+        if not is_qparam(zp_choose_qparam):
+            return False
+
     return True


@@ -223,7 +229,7 @@ def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node):
         # add target_dtype_node after quant_min/quant_max
         args.append(target_dtype)
         # zero_point_domain
-        if len(node.args) > 7 and node.args[7] != "INT":
+        if len(node.args) > 7 and node.args[7] not in ["INT", "NONE"]:
             return None, None

     if is_per_channel_group(node):
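Condensed sketch of the relaxed is_dynamic_qdq check (helper names and the is_qparam predicate argument are illustrative, not the module's real signatures): the scale must reach back to a choose_qparams result through operator.getitem, while the zero_point chain is only validated when a third argument is actually present.

import operator

def _comes_from_choose_qparams(node, is_qparam) -> bool:
    # an operator.getitem node whose producer is a choose_qparams op
    return node.target is operator.getitem and is_qparam(node.all_input_nodes[0])

def looks_dynamic(qdq_args, is_qparam) -> bool:
    if not _comes_from_choose_qparams(qdq_args[1], is_qparam):  # scale
        return False
    if len(qdq_args) > 2:  # zero_point is now optional
        return _comes_from_choose_qparams(qdq_args[2], is_qparam)
    return True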

examples/models/llama/export_llama_lib.py

Lines changed: 0 additions & 1 deletion

@@ -763,7 +763,6 @@ def _to_edge_and_lower_llama_xnnpack(
         raise NotImplementedError(
             "export_llama does not support XNNPack and generating ETRecord at the moment."
         )
-
     builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
         partitioners
     )

examples/models/llama/model.py

Lines changed: 5 additions & 1 deletion

@@ -16,8 +16,8 @@
     get_default_model_resource_dir,
 )
 from executorch.examples.models.llama.llama_transformer import Transformer
-
 from executorch.examples.models.llama.model_args import ModelArgs
+from torchao.utils import TorchAOBaseTensor

 try:
     from .fairseq2 import convert_to_llama_checkpoint
@@ -101,6 +101,7 @@ def __init__(self, **kwargs):
         if fairseq2_checkpoint:
             print("Using fairseq2 checkpoint")
             checkpoint = convert_to_llama_checkpoint(checkpoint=checkpoint)
+        print("checkpoint", checkpoint)
         if "model" in checkpoint:
             # NB: some checkpoint contains a "model" field, which is the actual weights dict
             checkpoint = checkpoint["model"]
@@ -257,6 +258,9 @@ def __init__(self, **kwargs):
                 strict=False,
                 assign=True,
             )  # self.model_ = Transformer(gptconf)
+            for param in self.model_.parameters():
+                if isinstance(param, TorchAOBaseTensor):
+                    param.requires_grad = False
         else:
             print("Checkpoint not provided, defaulting weights to zeros.")
             self.model_.to_empty(device="cpu")
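The freeze above appears to guard against checkpoints whose weights were quantized with torchao: such weights are TorchAOBaseTensor subclasses, and after load_state_dict(..., assign=True) they are assigned into the module as-is, so they are explicitly marked non-trainable. A standalone sketch of the same loop (the function name is illustrative, not from the commit):

import torch
from torchao.utils import TorchAOBaseTensor

def freeze_torchao_tensors(model: torch.nn.Module) -> None:
    # mark torchao tensor-subclass parameters as non-trainable
    for param in model.parameters():
        if isinstance(param, TorchAOBaseTensor):
            param.requires_grad = False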

examples/models/phi_4_mini/convert_weights.py

Lines changed: 80 additions & 15 deletions

@@ -1,4 +1,5 @@
 import argparse
+import os
 from typing import Dict

 import torch
@@ -7,6 +8,63 @@

 from torchtune.training import FullModelHFCheckpointer

+_HF_PHI_4_FROM_META = {
+    "tok_embeddings.weight": "model.embed_tokens.weight",
+    "norm.weight": "model.norm.weight",
+    "layers.{}.attention.wq.weight": "model.layers.{}.self_attn.q_proj.weight",
+    "layers.{}.attention.wk.weight": "model.layers.{}.self_attn.k_proj.weight",
+    "layers.{}.attention.wv.weight": "model.layers.{}.self_attn.v_proj.weight",
+    "layers.{}.attention.wo.weight": "model.layers.{}.self_attn.o_proj.weight",
+    "layers.{}.attention_norm.weight": "model.layers.{}.input_layernorm.weight",
+    "layers.{}.ffn_norm.weight": "model.layers.{}.post_attention_layernorm.weight",
+    "layers.{}.feed_forward.w1.weight": "model.layers.{}.mlp.gate_proj.weight",
+    "layers.{}.feed_forward.w3.weight": "model.layers.{}.mlp.up_proj.weight",
+    "layers.{}.feed_forward.w2.weight": "model.layers.{}.mlp.down_proj.weight",
+    "output.weight": "lm_head.weight",
+}
+
+
+def phi_4_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from hf's format to Meta's format.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in hf's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _HF_PHI_4_FROM_META.items()}
+
+    for key, value in state_dict.items():
+        if key.endswith("mlp.gate_up_proj.weight"):
+            # Split the gate_up_proj into gate_proj and up_proj
+            hidden_dim = value.shape[0] // 2
+            assert 2 * hidden_dim == value.shape[0]
+            gate = value[0:hidden_dim, :]
+            up = value[hidden_dim:, :]
+            for new_key, new_value in [("gate_proj", gate), ("up_proj", up)]:
+                new_key = key.replace("gate_up_proj", new_key)
+                new_key = get_mapped_key(new_key, inverted_mapping_dict)
+                converted_state_dict[new_key] = new_value
+        elif key.endswith("self_attn.qkv_proj.weight"):
+            # Split the qkv_proj into q_proj, k_proj, and v_proj
+            q_dim = value.shape[1]
+            kv_dim = (value.shape[0] - q_dim) // 2
+            assert 2 * kv_dim + q_dim == value.shape[0]
+            q = value[0:q_dim, :]
+            k = value[q_dim : (q_dim + kv_dim), :]
+            v = value[(q_dim + kv_dim) :, :]
+            for new_key, new_value in [("q_proj", q), ("k_proj", k), ("v_proj", v)]:
+                new_key = key.replace("qkv_proj", new_key)
+                new_key = get_mapped_key(new_key, inverted_mapping_dict)
+                converted_state_dict[new_key] = new_value
+        else:
+            new_key = get_mapped_key(key, inverted_mapping_dict)
+            converted_state_dict[new_key] = value
+    return converted_state_dict
+

 # Standard _FROM_META weight mapping of Meta weights to TorchTune.
 _PHI_4_FROM_META = {
@@ -51,22 +109,29 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T
     return converted_state_dict


-def convert_weights(input_dir: str, output_file: str) -> None:
+def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None:
     # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves.
-    checkpointer = FullModelHFCheckpointer(
-        checkpoint_dir=input_dir,
-        checkpoint_files=[
-            "model-00001-of-00002.safetensors",
-            "model-00002-of-00002.safetensors",
-        ],
-        output_dir=".",
-        model_type="PHI4",
-    )
+    if os.path.isdir(input_dir_or_checkpoint):
+        checkpointer = FullModelHFCheckpointer(
+            checkpoint_dir=input_dir_or_checkpoint,
+            checkpoint_files=[
+                "model-00001-of-00002.safetensors",
+                "model-00002-of-00002.safetensors",
+            ],
+            output_dir=".",
+            model_type="PHI4",
+        )
+        print("Loading checkpoint from directory...")
+        sd = checkpointer.load_checkpoint()
+        sd = sd["model"]
+        print("Converting checkpoint...")
+        sd = phi_4_tune_to_meta(sd)
+    else:
+        print("Loading checkpoint from file...")
+        sd = torch.load(input_dir_or_checkpoint, map_location="cpu", weights_only=True)
+        print("Converting checkpoint...")
+        sd = phi_4_hf_to_meta(sd)

-    print("Loading checkpoint...")
-    sd = checkpointer.load_checkpoint()
-    print("Converting checkpoint...")
-    sd = phi_4_tune_to_meta(sd["model"])
     print("Saving checkpoint...")
     torch.save(sd, output_file)
     print("Done.")
@@ -79,7 +144,7 @@ def main():
     parser.add_argument(
         "input_dir",
         type=str,
-        help="Path to directory containing checkpoint files",
+        help="Path to directory containing checkpoint files, or path to a single checkpoint file.",
     )
     parser.add_argument("output", type=str, help="Path to the output checkpoint")
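For illustration, a possible invocation of the updated converter (the module path and file names are assumed examples, not from this commit): it now accepts either a Hugging Face checkpoint directory or a single checkpoint file.

from executorch.examples.models.phi_4_mini.convert_weights import convert_weights

# sharded HF safetensors directory: FullModelHFCheckpointer + phi_4_tune_to_meta
convert_weights("phi-4-mini/", "phi4_meta.pth")
# single checkpoint file: torch.load + phi_4_hf_to_meta
convert_weights("phi-4-mini/pytorch_model.bin", "phi4_meta.pth")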

extension/llm/export/builder.py

Lines changed: 7 additions & 1 deletion

@@ -31,7 +31,6 @@
 from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
-
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program

 from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
@@ -41,6 +40,7 @@
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
 from torch.export import export_for_training, ExportedProgram
 from torch.nn.attention import SDPBackend
+from torchao.utils import unwrap_tensor_subclass

 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -199,6 +199,11 @@ def _get_edge_config(self) -> EdgeCompileConfig:
         return edge_config

     def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
+        if module is not None:
+            unwrap_tensor_subclass(module)
+        else:
+            unwrap_tensor_subclass(self.model)
+
         dynamic_shape = self._get_dynamic_shape()
         # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
         # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
@@ -226,6 +231,7 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
                 logging.info("Re-exporting with:")
             else:
                 logging.info("Exporting with:")
+
             logging.info(f"inputs: {self.example_inputs}")
             logging.info(f"kwargs: {self.example_kwarg_inputs}")
             logging.info(f"dynamic shapes: {dynamic_shape}")
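For context, a hedged sketch (the toy model and quantization choice are assumptions, not part of this commit) of why _export now calls unwrap_tensor_subclass first: torchao quantization stores weights as tensor subclasses, and unwrapping them into plain tensors plus parametrizations lets torch.export trace the module.

import torch
from torchao.quantization import int8_weight_only, quantize_
from torchao.utils import unwrap_tensor_subclass

model = torch.nn.Sequential(torch.nn.Linear(16, 16))
quantize_(model, int8_weight_only())  # weights become torchao tensor subclasses
unwrap_tensor_subclass(model)  # flatten the subclasses so export can trace the module
ep = torch.export.export_for_training(model, (torch.randn(1, 16),))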
