
Commit 38cfb8d

Update on "Add quantized op support to llama runner"
Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
Differential Revision: [D56197863](https://our.internmc.facebook.com/intern/diff/D56197863)
[ghstack-poisoned]
2 parents: 020dc4e + ac60837

3 files changed (+17, -11 lines)

examples/models/llama2/ops/quantized.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
-- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::quantized_embedding_byte_out

-- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null

examples/models/llama2/ops/quantized_ops.py

Lines changed: 14 additions & 8 deletions
@@ -15,22 +15,22 @@
     "llama_quantized", "DEF"
 )  # to not be confused with torch.ops.quantized.* ops.
 quantized_lib.define(
-    "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
 )

 quantized_lib.define(
-    "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )
@@ -66,7 +66,9 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
     ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"


-@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib, "DEPRECATED_DO_NOT_USE_embedding_byte", "CompositeExplicitAutograd"
+)
 def embedding_byte(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -92,7 +94,7 @@ def embedding_byte(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out")
 def embedding_byte_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -112,7 +114,11 @@ def embedding_byte_out_meta(
     )


-@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib,
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype",
+    "CompositeExplicitAutograd",
+)
 def embedding_byte_dtype(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -140,7 +146,7 @@ def embedding_byte_dtype(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.dtype_out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out")
 def embedding_byte_dtype_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
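
Taken together, these hunks preserve the existing split: the functional and .dtype overloads keep eager CompositeExplicitAutograd implementations (both of which, per the context lines, end in torch.ops.aten.embedding.default(weight, indices)), while the .out and .dtype_out overloads receive only abstract meta implementations here, with their concrete kernels bound by the quantized.yaml entries above. As a minimal sketch of calling the renamed functional overload in eager mode; the import path and the tensor shapes/dtypes are illustrative assumptions, not taken from this diff:

import torch

# Assumption: run from the repository root so this package path resolves; importing
# the module executes the quantized_lib.define(...) and @impl registrations above.
from examples.models.llama2.ops import quantized_ops  # noqa: F401

# Illustrative data only: an 8-bit weight table, per-row scales, and lookup indices.
weight = torch.randint(-128, 127, (16, 8), dtype=torch.int8)
weight_scales = torch.rand(16)
indices = torch.tensor([0, 3, 7])

# Functional overload under its new name; zero points omitted and quant_min/quant_max
# passed as 0/0, mirroring the argument pattern quantize.py uses below.
embedded = torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte(
    weight, weight_scales, None, 0, 0, indices
)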

examples/models/llama2/quantize.py

Lines changed: 1 addition & 1 deletion
@@ -377,7 +377,7 @@ def __init__(
 
     @torch.no_grad()
     def forward(self, indices: torch.Tensor) -> torch.Tensor:
-        return torch.ops.llama_quantized.embedding_byte.dtype(
+        return torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
             self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
         )
 
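
This forward is the only call site the commit touches; it reaches the op through the torch.ops namespace, so only the qualified name changes. A standalone sketch of the same .dtype call pattern, with self.weight, self.scales, and self.dtype replaced by illustrative stand-ins (same import assumption as the previous sketch):

import torch
from examples.models.llama2.ops import quantized_ops  # noqa: F401  # assumed import path

weight = torch.randint(-128, 127, (32, 4), dtype=torch.int8)  # stand-in for self.weight
scales = torch.rand(32)                                        # stand-in for self.scales
indices = torch.tensor([1, 5, 9])

out = torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
    weight, scales, None, 0, 0, indices, dtype=torch.float32  # dtype stands in for self.dtype
)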
