Commit f104e0f
metascroy authored and YIWENX14 committed
Assert quant_min/quant_max in embedding4bit (#7410)
* init
* format fix
1 parent 6f939f4 commit f104e0f

File tree

2 files changed: +31 -3 lines changed

examples/models/llama/source_transformation/quantize.py

Lines changed: 3 additions & 3 deletions

@@ -729,18 +729,18 @@ def __init__(
     def forward(self, indices: torch.Tensor) -> torch.Tensor:
         if not self.packed:  # 8bit
             return torch.ops.quantized_decomposed.embedding_byte.dtype(
-                self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+                self.weight, self.scales, None, -128, 127, indices, dtype=self.dtype
             )
         else:  # packed
             if self.bitwidth == 2:
                 return torch.ops.quantized_decomposed.embedding_2bit.dtype(
-                    self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+                    self.weight, self.scales, None, -2, 1, indices, dtype=self.dtype
                 )
 
             # Remaining case (always return to make pyre happy)
             assert self.bitwidth == 4
             return torch.ops.quantized_decomposed.embedding_4bit.dtype(
-                self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+                self.weight, self.scales, None, -8, 7, indices, dtype=self.dtype
             )
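
The quant_min/quant_max pairs passed here are the full signed two's-complement ranges for each bitwidth. A minimal sketch of that arithmetic (plain Python, not part of the patch):

def signed_qrange(bits: int) -> tuple[int, int]:
    # Full signed two's-complement range for a bits-wide integer:
    # qmin = -2**(bits - 1), qmax = 2**(bits - 1) - 1
    return -(2 ** (bits - 1)), 2 ** (bits - 1) - 1

assert signed_qrange(2) == (-2, 1)      # embedding_2bit
assert signed_qrange(4) == (-8, 7)      # embedding_4bit
assert signed_qrange(8) == (-128, 127)  # embedding_byte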

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 28 additions & 0 deletions

@@ -202,6 +202,13 @@ def embedding_2bit(
     weight_quant_max: int,
     indices: torch.Tensor,
 ) -> torch.Tensor:
+    assert (
+        weight_quant_min == -2
+    ), "embedding_2bit in ExecuTorch expects weight_quant_min == -2"
+    assert (
+        weight_quant_max == 1
+    ), "embedding_2bit in ExecuTorch expects weight_quant_max == 1"
+
     embedding_weight_checks(weight, weight_scales, weight_zero_points)
     group_size = (4 * weight.size(1)) // (
         weight_scales.size(1) if weight_scales.dim() == 2 else 1

@@ -257,6 +264,13 @@ def embedding_2bit_dtype(
     indices: torch.Tensor,
     dtype: Optional[torch.dtype],
 ) -> torch.Tensor:
+    assert (
+        weight_quant_min == -2
+    ), "embedding_2bit_dtype in ExecuTorch expects weight_quant_min == -2"
+    assert (
+        weight_quant_max == 1
+    ), "embedding_2bit_dtype in ExecuTorch expects weight_quant_max == 1"
+
     embedding_weight_checks(weight, weight_scales, weight_zero_points)
     group_size = (4 * weight.size(1)) // (
         weight_scales.size(1) if weight_scales.dim() == 2 else 1
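
The group_size computation in the surrounding context lines reflects packing density: one byte holds four 2-bit values, so the unpacked embedding dim is 4 * weight.size(1); the 4-bit hunks below use 2 * weight.size(1), two values per byte. A worked example with made-up sizes, not from the patch:

# Hypothetical sizes illustrating the group_size math above.
packed_cols = 16  # weight.size(1): packed uint8 columns
n_groups = 2      # weight_scales.size(1) when scales are 2-D

group_size_2bit = (4 * packed_cols) // n_groups  # 64 // 2 == 32
group_size_4bit = (2 * packed_cols) // n_groups  # 32 // 2 == 16
assert (group_size_2bit, group_size_4bit) == (32, 16)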
@@ -334,6 +348,13 @@ def embedding_4bit(
     weight_quant_max: int,
     indices: torch.Tensor,
 ) -> torch.Tensor:
+    assert (
+        weight_quant_min == -8
+    ), "embedding_4bit in ExecuTorch expects weight_quant_min == -8"
+    assert (
+        weight_quant_max == 7
+    ), "embedding_4bit in ExecuTorch expects weight_quant_max == 7"
+
     embedding_weight_checks(weight, weight_scales, weight_zero_points)
     group_size = (2 * weight.size(1)) // (
         weight_scales.size(1) if weight_scales.dim() == 2 else 1

@@ -387,6 +408,13 @@ def embedding_4bit_dtype(
     indices: torch.Tensor,
     dtype: Optional[torch.dtype],
 ) -> torch.Tensor:
+    assert (
+        weight_quant_min == -8
+    ), "embedding_4bit_dtype in ExecuTorch expects weight_quant_min == -8"
+    assert (
+        weight_quant_max == 7
+    ), "embedding_4bit_dtype in ExecuTorch expects weight_quant_max == 7"
+
     embedding_weight_checks(weight, weight_scales, weight_zero_points)
     group_size = (2 * weight.size(1)) // (
         weight_scales.size(1) if weight_scales.dim() == 2 else 1
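
All four asserts follow one pattern: pin weight_quant_min/weight_quant_max to the full signed range for the op's bitwidth. A standalone sketch of that guard (hypothetical helper, mirroring rather than reusing the ExecuTorch code):

def check_embedding_qparams(op_name: str, bits: int, quant_min: int, quant_max: int) -> None:
    # Mirrors the new asserts: the packed embedding ops accept only the
    # full signed two's-complement range for their bitwidth.
    expected_min = -(2 ** (bits - 1))
    expected_max = 2 ** (bits - 1) - 1
    assert (
        quant_min == expected_min
    ), f"{op_name} in ExecuTorch expects weight_quant_min == {expected_min}"
    assert (
        quant_max == expected_max
    ), f"{op_name} in ExecuTorch expects weight_quant_max == {expected_max}"

check_embedding_qparams("embedding_2bit", 2, -2, 1)  # passes
check_embedding_qparams("embedding_4bit", 4, -8, 7)  # passes
# check_embedding_qparams("embedding_4bit", 4, 0, 15) would raise AssertionError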
