Quantization support for groupwise embedding, various fp16 support

Michael Gschwind · facebook-github-bot · commit a3a525a879e2 · 2024-03-05T12:56:09.000-08:00
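Summary: Quantization support for groupwise embedding, various fp16 support.
Changes visible in this diff:
- embedding_byte_weight_checks: weight_zero_points must now have the same dim() as weight_scales (previously it had to have dim() == 1).
- dynamically_quantize_per_channel: the branch previously taken only when enable_non_multiple_groups was false is now also taken whenever x_shape_1 is divisible by group_size.
- dynamically_quantize_per_channel: prints the shapes of quant and scales before returning.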

Differential Revision: D54549727
diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py
@@ -57,8 +57,8 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
         weight_zero_points is None or weight_zero_points.dtype == weight_scales.dtype
     ), "Expecting weight_zero_points to be None or have same dtype as weight_scales"
     assert (
-        weight_zero_points is None or weight_zero_points.dim() == 1
-    ), f"Expecting weight_zero_points tensor to be None or have dim()==1, but found {weight_zero_points.dim()}"
+        weight_zero_points is None or weight_zero_points.dim() == weight_scales.dim()
+    ), f"Expecting weight_zero_points tensor to be None or have dim() same as weight scales, but found {weight_zero_points.dim()}"
     assert weight_zero_points is None or weight_zero_points.size(0) == weight.size(
         0
     ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"
diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py
@@ -76,7 +76,7 @@ def dynamically_quantize_per_channel(
 
     if group_size is None or group_size == 0:
         items = x_shape_1
-    elif not enable_non_multiple_groups:
+    elif (x_shape_1 % group_size == 0) or not enable_non_multiple_groups:
         assert group_size > 0, "group size must be positive"
         assert (
             x_shape_1 % group_size
@@ -128,6 +128,7 @@ def dynamically_quantize_per_channel(
     scales = scales.to(dtype=scales_dtype)
     quant = quant[:, :x_shape_1]
 
+    print(f"quant shape {quant.shape} scales shape {scales.shape}")
     return quant, scales, zero_points