
Commit 386d1b3

mcr229 authored and kirklandsign committed
Default Uninitialized Llama2 Weights to Zeros, and Provide Better Quantization for Example Models (#9634)
### Summary

Changes:

1. When initializing Llama2 for aot_compiler: since checkpoints can only be downloaded from Hugging Face, we initialize Llama2 with uninitialized weights when no checkpoint is given. The problem is that quantization can then fail with histogram errors if the uninitialized values are NaN. We fix this by initializing the weights to zeros when no checkpoint is provided, which guarantees the quantization step can still run.

2. Quant type in the AoT compiler: among the model options available to XNNPACK, everything is quantized with per-tensor static quantization. This isn't the best option for all of the models. Transformer-based models like Llama and MobileBert generally prefer dynamically quantized per-channel weights, whereas CNNs like MobileNet prefer statically quantized per-channel weights. We add this quant type to the existing model options. This also helps with test timeouts: per-tensor static quantization on a model like Llama can take a long time because it introduces many q/dq nodes and creates complex partitions, so proposing partitions is slow due to the repeated BFS used to find the largest possible partition. Specifying the more apt scheme, such as dynamic per-channel quantization, avoids this complexity.

Overall, this should fix the flaky [nan, nan] errors in the quantization histogram, and it should also help with CI timing out.

### Test plan

OSS XNNPACK CI for all model delegation

cc @digantdesai @cbilgin
1 parent 0b1c29b commit 386d1b3
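For context on the "[nan, nan]" histogram errors the summary mentions, here is a minimal standalone sketch (not part of this commit, plain PyTorch only): histogram-based observers need a finite min/max range over the observed values, and uninitialized weights that happen to contain NaN make that range non-finite.

```python
import torch

# Uninitialized storage can contain NaN; a histogram over such values
# has no finite range, which surfaces as the flaky "[nan, nan]" error.
weights = torch.full((8,), float("nan"))
try:
    torch.histc(weights, bins=10)
except RuntimeError as e:
    print(e)  # e.g. a "range of [nan, nan] is not finite"-style error

# Zero-filled weights give a finite (if degenerate) range, so observers run.
torch.histc(torch.zeros(8), bins=10)
```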

File tree

5 files changed: +60 -28 lines changed


.ci/scripts/gather_test_models.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -14,7 +14,7 @@
 from typing import Any
 
 from examples.models import MODEL_NAME_TO_MODEL
-from examples.xnnpack import MODEL_NAME_TO_OPTIONS
+from examples.xnnpack import MODEL_NAME_TO_OPTIONS, QuantType
 
 DEFAULT_RUNNERS = {
     "linux": "linux.2xlarge",
@@ -154,7 +154,7 @@ def export_models_for_ci() -> dict[str, dict]:
         if backend == "xnnpack":
             if name not in MODEL_NAME_TO_OPTIONS:
                 continue
-            if MODEL_NAME_TO_OPTIONS[name].quantization:
+            if MODEL_NAME_TO_OPTIONS[name].quantization != QuantType.NONE:
                 backend += "-quantization"
 
             if MODEL_NAME_TO_OPTIONS[name].delegation:
```
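For illustration, a hypothetical condensed version of the updated CI check; the `backend_label` function and the `-delegation` suffix are assumptions for this sketch, and only the quantization test is taken verbatim from the diff:

```python
from examples.xnnpack import MODEL_NAME_TO_OPTIONS, QuantType

def backend_label(name: str) -> str:
    # A model is now tagged "-quantization" for any quant type other
    # than NONE, rather than for a boolean True as before.
    backend = "xnnpack"
    options = MODEL_NAME_TO_OPTIONS[name]
    if options.quantization != QuantType.NONE:
        backend += "-quantization"
    if options.delegation:
        backend += "-delegation"  # assumed suffix, for illustration
    return backend
```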

examples/models/llama/model.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -259,15 +259,22 @@ def __init__(self, **kwargs):
                 assign=True,
             )  # self.model_ = Transformer(gptconf)
         else:
-            print("Checkpoint not provided, defaulting to uninitialized weights.")
+            print("Checkpoint not provided, defaulting weights to zeros.")
             self.model_.to_empty(device="cpu")
+            for p in self.model_.parameters():
+                p.data.fill_(0)
+            for b in self.model_.buffers():
+                b.data.fill_(0)
     except RuntimeError as e:
         print(
-            f"Could not load checkpoint into mode and will default to uninitialized weights due to error: {e}."
+            f"Could not load checkpoint into model and will default weights to zeros due to error: {e}."
         )
         # Need to provide concrete (empty) values for meta-initialized tensors for quantization.
         self.model_.to_empty(device="cpu")
-
+        for p in self.model_.parameters():
+            p.data.fill_(0)
+        for b in self.model_.buffers():
+            b.data.fill_(0)
     if missing:
         missing_weights = [fqn for fqn in missing if fqn.endswith(".weight")]
         if missing_weights:
```
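As a standalone illustration of the pattern added above (a plain `nn.Linear` stands in for the Llama `Transformer`): `to_empty()` gives meta-initialized parameters real but uninitialized CPU storage, and the explicit zero-fill makes those values deterministic so later quantization observers see finite numbers.

```python
import torch
import torch.nn as nn

# Build on the meta device: parameters have shapes but no real storage.
with torch.device("meta"):
    model = nn.Linear(16, 4)

# Allocate real (but uninitialized) CPU storage for every tensor.
model.to_empty(device="cpu")

# Uninitialized memory may hold anything, including NaN; zero-fill it.
for p in model.parameters():
    p.data.fill_(0)
for b in model.buffers():
    b.data.fill_(0)

assert torch.isfinite(model.weight).all()
```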

examples/xnnpack/__init__.py

Lines changed: 30 additions & 19 deletions
```diff
@@ -7,33 +7,44 @@
 # pyre-unsafe
 
 from dataclasses import dataclass
+from enum import Enum
+
+
+class QuantType(Enum):
+    NONE = 1
+    # Used for operations that don't have weights
+    STATIC_PER_TENSOR = 2
+    # Best for CNN/RNN models with conv layers
+    STATIC_PER_CHANNEL = 3
+    # Used for linear layers and transformer-based models
+    DYNAMIC_PER_CHANNEL = 4
 
 
 @dataclass
 class XNNPACKOptions(object):
-    quantization: bool
+    quantization: QuantType
     delegation: bool
 
 
 MODEL_NAME_TO_OPTIONS = {
-    "linear": XNNPACKOptions(True, True),
-    "add": XNNPACKOptions(True, True),
-    "add_mul": XNNPACKOptions(True, True),
-    "dl3": XNNPACKOptions(True, True),
-    "ic3": XNNPACKOptions(True, True),
-    "ic4": XNNPACKOptions(True, True),
-    "mv2": XNNPACKOptions(True, True),
-    "mv3": XNNPACKOptions(True, True),
-    "resnet18": XNNPACKOptions(True, True),
-    "resnet50": XNNPACKOptions(True, True),
-    "vit": XNNPACKOptions(True, True),
-    "w2l": XNNPACKOptions(True, True),
-    "edsr": XNNPACKOptions(True, True),
-    "mobilebert": XNNPACKOptions(True, True),
-    "llama2": XNNPACKOptions(False, True),
-    "emformer_join": XNNPACKOptions(True, True),
-    "emformer_predict": XNNPACKOptions(True, True),
-    "emformer_transcribe": XNNPACKOptions(True, True),
+    "linear": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "add": XNNPACKOptions(QuantType.STATIC_PER_TENSOR, True),
+    "add_mul": XNNPACKOptions(QuantType.STATIC_PER_TENSOR, True),
+    "dl3": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "ic3": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "ic4": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "mv2": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "mv3": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "resnet18": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "resnet50": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "vit": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "w2l": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "edsr": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
+    "mobilebert": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "llama2": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "emformer_join": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "emformer_predict": XNNPACKOptions(QuantType.DYNAMIC_PER_CHANNEL, True),
+    "emformer_transcribe": XNNPACKOptions(QuantType.STATIC_PER_CHANNEL, True),
 }
```
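A quick sanity-check sketch against the new options table (imports as in the diff above):

```python
from examples.xnnpack import MODEL_NAME_TO_OPTIONS, QuantType

# Transformer-style models now request dynamic per-channel weights,
# while CNNs such as MobileNetV2 stay statically quantized per channel.
assert MODEL_NAME_TO_OPTIONS["mobilebert"].quantization == QuantType.DYNAMIC_PER_CHANNEL
assert MODEL_NAME_TO_OPTIONS["mv2"].quantization == QuantType.STATIC_PER_CHANNEL
assert MODEL_NAME_TO_OPTIONS["mv2"].delegation is True
```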
examples/xnnpack/aot_compiler.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -66,7 +66,7 @@
 
 args = parser.parse_args()
 
-if not args.delegate:
+if not args.delegate and args.quantize:
     raise NotImplementedError(
         "T161880157: Quantization-only without delegation is not supported yet"
     )
@@ -79,6 +79,8 @@
         f"Available models are {list(MODEL_NAME_TO_OPTIONS.keys())}."
     )
 
+quant_type = MODEL_NAME_TO_OPTIONS[args.model_name].quantization
+
 model, example_inputs, _, _ = EagerModelFactory.create_model(
     *MODEL_NAME_TO_MODEL[args.model_name]
 )
@@ -91,7 +93,7 @@
 if args.quantize:
     logging.info("Quantizing Model...")
     # TODO(T165162973): This pass shall eventually be folded into quantizer
-    model = quantize(model, example_inputs)
+    model = quantize(model, example_inputs, quant_type)
 ep = torch.export.export_for_training(model, example_inputs)
 
 edge = to_edge_transform_and_lower(
```
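With this change the quant type is looked up from the model options rather than chosen by the user, so an invocation like `python -m examples.xnnpack.aot_compiler --model_name=llama2 --quantize --delegate` (flag names inferred from the `args.*` references above) picks dynamic per-channel quantization for llama2 automatically. A sketch of the resulting flow; the `EagerModelFactory` import path is an assumption, since it sits outside the hunks shown:

```python
from examples.models import MODEL_NAME_TO_MODEL
from examples.models.model_factory import EagerModelFactory  # assumed path
from examples.xnnpack import MODEL_NAME_TO_OPTIONS
from examples.xnnpack.quantization.utils import quantize

name = "llama2"
# New: the quant type comes from the options table, not a user flag.
quant_type = MODEL_NAME_TO_OPTIONS[name].quantization  # DYNAMIC_PER_CHANNEL

model, example_inputs, _, _ = EagerModelFactory.create_model(
    *MODEL_NAME_TO_MODEL[name]
)
model = quantize(model, example_inputs, quant_type)
```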

examples/xnnpack/quantization/utils.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -13,13 +13,25 @@
 
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
+from .. import QuantType
 
-def quantize(model, example_inputs):
+
+def quantize(
+    model, example_inputs, quant_type: QuantType = QuantType.STATIC_PER_TENSOR
+):
     """This is the official recommended flow for quantization in pytorch 2.0 export"""
     logging.info(f"Original model: {model}")
     quantizer = XNNPACKQuantizer()
     # if we set is_per_channel to True, we also need to add out_variant of quantize_per_channel/dequantize_per_channel
-    operator_config = get_symmetric_quantization_config(is_per_channel=False)
+    is_per_channel = (
+        quant_type == QuantType.STATIC_PER_CHANNEL
+        or quant_type == QuantType.DYNAMIC_PER_CHANNEL
+    )
+    is_dynamic = quant_type == QuantType.DYNAMIC_PER_CHANNEL
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=is_per_channel,
+        is_dynamic=is_dynamic,
+    )
     quantizer.set_global(operator_config)
     m = prepare_pt2e(model, quantizer)
     # calibration
```
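A usage sketch for the extended helper; the model and inputs are placeholders, and the upfront `export_for_training(...).module()` capture is an assumption, since `prepare_pt2e` expects a captured graph module and that step sits outside this hunk:

```python
import torch
import torch.nn as nn

from examples.xnnpack import QuantType
from examples.xnnpack.quantization.utils import quantize

# QuantType maps onto the quantizer config computed above:
#   STATIC_PER_TENSOR   -> is_per_channel=False, is_dynamic=False
#   STATIC_PER_CHANNEL  -> is_per_channel=True,  is_dynamic=False
#   DYNAMIC_PER_CHANNEL -> is_per_channel=True,  is_dynamic=True
model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())  # placeholder model
example_inputs = (torch.randn(1, 8),)

# Assumed prerequisite: capture the eager model before quantizing.
model = torch.export.export_for_training(model, example_inputs).module()
quantized = quantize(model, example_inputs, QuantType.DYNAMIC_PER_CHANNEL)
```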
