
Commit 2966e38

manuelcandales authored and facebook-github-bot committed
Enable embedding_byte output dtype to differ from scales/zp dtype (#2091)
Summary: Pull Request resolved: #2091
Reviewed By: mikekgfb, cbilgin
Differential Revision: D54141337
fbshipit-source-id: f79754770ddca459e0e23680b42f84d6ff5ce21a
1 parent 5a18cc6 commit 2966e38

File tree

7 files changed: +99 −23 lines
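
The net effect across these files: both embedding_byte ops gain a keyword-only ScalarType? dtype=None argument, so the dequantized output can land in a dtype other than that of the scales/zero-points. A minimal usage sketch (illustrative shapes and quant ranges, not taken from the commit; assumes the llama_quantized op library from examples/models/llama2/ops/quantized_ops.py has been imported so the op is registered):

import torch

# Quantized embedding table (int8) with fp16 per-channel scales.
weight = torch.randint(-128, 127, (10, 4), dtype=torch.int8)
scales = torch.rand(10, dtype=torch.float16)
indices = torch.tensor([0, 3, 7])

# dtype is the new keyword-only argument: the output comes back as fp32
# even though the scales are fp16, which the old schema could not express.
out = torch.ops.llama_quantized.embedding_byte.default(
    weight, scales, None, -128, 127, indices, dtype=torch.float32
)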

examples/models/llama2/ops/quantized.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
   - arg_meta: null

examples/models/llama2/ops/quantized_ops.py

Lines changed: 8 additions & 3 deletions
@@ -14,12 +14,12 @@
 ) # to not be confused with torch.ops.quantized.* ops.
 quantized_lib.define(
     "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
-    "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
 )

 quantized_lib.define(
     "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
-    "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )

@@ -31,6 +31,8 @@ def embedding_byte_meta(
     weight_quant_min,
     weight_quant_max,
     indices,
+    *,
+    dtype,
 ):
     assert weight.dtype in [
         torch.int8,
@@ -71,7 +73,7 @@ def embedding_byte_meta(
         weight_quant_max,
         weight.dtype,
     )
-    return torch.ops.aten.embedding.default(weight, indices)
+    return torch.ops.aten.embedding.default(weight, indices).to(dtype)


 @impl_abstract("llama_quantized::embedding_byte.out")
@@ -82,6 +84,8 @@ def embedding_byte_out_meta(
     weight_quant_min,
     weight_quant_max,
     indices,
+    *,
+    dtype,
     out,
 ):
     return embedding_byte_meta(
@@ -91,4 +95,5 @@ def embedding_byte_out_meta(
         weight_quant_min,
         weight_quant_max,
         indices,
+        dtype=dtype,
     )
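
These meta ("abstract") implementations exist so export-time tracing, which runs on fake tensors instead of the real kernel, reports the correct output dtype; without the trailing .to(dtype) the traced graph would claim the output has the dequantized weight's dtype. A hedged sketch of that behavior under FakeTensorMode (assumes the op library above has been imported; FakeTensorMode is a torch-internal API):

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    weight = torch.empty(10, 4, dtype=torch.int8)
    scales = torch.empty(10, dtype=torch.float16)
    indices = torch.empty(3, dtype=torch.long)
    # Dispatches to embedding_byte_meta above, not a real kernel.
    out = torch.ops.llama_quantized.embedding_byte.default(
        weight, scales, None, -128, 127, indices, dtype=torch.float32
    )
    assert out.dtype == torch.float32  # follows dtype, not scales.dtype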

examples/models/llama2/quantize.py

Lines changed: 2 additions & 2 deletions
@@ -818,8 +818,8 @@ def __init__(
     @torch.no_grad()
     def forward(self, indices: torch.Tensor) -> torch.Tensor:
         return torch.ops.llama_quantized.embedding_byte.default(
-            self.weight, self.scales, None, 0, 0, indices
-        ).to(self.dtype)
+            self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+        )


         # result_weights = self.weight.index_select(0, indices.view(-1))
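
Folding the cast into the op (dtype=self.dtype) rather than appending .to(self.dtype) means the exported graph carries one embedding_byte node instead of an op plus a separate cast, and the kernel can write the target dtype directly rather than materializing an intermediate in the scales dtype. An illustrative stand-in with plain tensors (hypothetical names, not the module's code):

import torch

scales_dtype, model_dtype = torch.float16, torch.float32
dequant = torch.randn(3, 4, dtype=scales_dtype)  # stand-in for the op output

# Old style: dequantize in the scales dtype, then cast; this leaves an extra
# cast node in the graph and an extra fp16 intermediate at runtime.
old_out = dequant.to(model_dtype)
# New style: the op itself receives dtype=model_dtype and writes fp32
# directly (see the CTYPE_PARAMS/CTYPE_OUT split in op_embedding.cpp below).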

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 48 additions & 1 deletion
@@ -27,7 +27,7 @@

 quantized_decomposed_lib.define(
     "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
-    "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
 )

 quantized_decomposed_lib.define(
@@ -482,6 +482,48 @@ def replacement(
         )
         return out

+    @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte")
+    def pattern_with_dtype(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indicies,
+        dtype,
+    ):
+        weight = torch.ops.quantized_decomposed.dequantize_per_channel.default(
+            weight,
+            weight_scales,
+            weight_zero_points,
+            0,
+            weight_quant_min,
+            weight_quant_max,
+            torch.uint8,
+        )
+        out = torch.ops.aten.embedding.default(weight, indicies).to(dtype)
+        return out
+
+    def replacement_with_dtype(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indicies,
+        dtype,
+    ):
+        out = torch.ops.quantized_decomposed.embedding_byte.default(
+            weight,
+            weight_scales,
+            weight_zero_points,
+            weight_quant_min,
+            weight_quant_max,
+            indicies,
+            dtype=dtype,
+        )
+        return out
+
     @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte")
     def pattern_with_padding_idx(
         weight,
@@ -529,6 +571,11 @@ def replacement_with_padding_idx(
             _trace_and_lower_to_edge_ops(replacement),
             [],
         ),
+        (
+            _trace_and_lower_to_edge_ops(pattern_with_dtype),
+            _trace_and_lower_to_edge_ops(replacement_with_dtype),
+            [],
+        ),
         (
             _trace_and_lower_to_edge_ops(pattern_with_padding_idx),
             _trace_and_lower_to_edge_ops(replacement_with_padding_idx),
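
These (pattern, replacement) pairs are traced to edge-dialect graphs and applied by a subgraph rewriter, so a dequantize_per_channel -> embedding -> to(dtype) chain in an exported program collapses into a single embedding_byte node that carries dtype. A toy demonstration of the same mechanism using torch.fx's public rewriter (stand-in ops, not exir's internal pass):

import torch
from torch.fx import symbolic_trace, subgraph_rewriter

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x).to(torch.float16)  # an op followed by a cast

def pattern(x):
    return torch.relu(x).to(torch.float16)

def replacement(x):
    return torch.relu(x.to(torch.float16))  # stand-in for a fused op

gm = symbolic_trace(M())
subgraph_rewriter.replace_pattern(gm, pattern, replacement)
print(gm.code)  # the relu -> to chain has been rewritten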

kernels/quantized/cpu/op_embedding.cpp

Lines changed: 30 additions & 15 deletions
@@ -31,6 +31,7 @@ void check_embedding_byte_args(
     const int64_t weight_quant_min,
     const int64_t weight_quant_max,
     const Tensor& indices,
+    exec_aten::optional<ScalarType> out_dtype,
     Tensor& out) {
   ET_CHECK_MSG(
       weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim());
@@ -75,8 +76,9 @@ void check_embedding_byte_args(
       static_cast<int8_t>(out.scalar_type()));

   ET_CHECK_MSG(
-      weight_scales.scalar_type() == out.scalar_type(),
-      "weight scales scalar type %" PRId8 " does not match out.scalar_type()",
+      weight_scales.scalar_type() == ScalarType::Float ||
+          weight_scales.scalar_type() == ScalarType::Half,
+      "weight_scales.scalar_type() %" PRId8 " is not supported:",
       static_cast<int8_t>(weight_scales.scalar_type()));

   if (opt_weight_zero_points.has_value()) {
@@ -116,13 +118,19 @@ void check_embedding_byte_args(
       " is greater than weight quant max: %" PRId64,
       weight_quant_min,
       weight_quant_max);
+
+  if (out_dtype.has_value()) {
+    ET_CHECK_MSG(
+        out.scalar_type() == out_dtype.value(),
+        "output_dtype must match the dtype of the out tensor");
+  }
 }

 /**
  * Retrieves the embeddings specified by indices, dequantizes them, and stores
  * them in out
  */
-template <class CTYPE_WEIGHT, class CTYPE_OUT>
+template <typename CTYPE_WEIGHT, typename CTYPE_PARAMS, typename CTYPE_OUT>
 void embedding_byte_per_channel(
     const Tensor& weight,
     const Tensor& weight_scales,
@@ -142,19 +150,19 @@ void embedding_byte_per_channel(
   CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
   const int64_t* indices_ptr = indices.const_data_ptr<int64_t>();

-  const CTYPE_OUT* scales = weight_scales.const_data_ptr<CTYPE_OUT>();
-  const CTYPE_OUT* zero_points = nullptr;
+  const CTYPE_PARAMS* scales = weight_scales.const_data_ptr<CTYPE_PARAMS>();
+  const CTYPE_PARAMS* zero_points = nullptr;
   if (opt_weight_zero_points.has_value()) {
-    zero_points = opt_weight_zero_points.value().const_data_ptr<CTYPE_OUT>();
+    zero_points = opt_weight_zero_points.value().const_data_ptr<CTYPE_PARAMS>();
   }

   for (int i = 0; i < indices.numel(); i++) {
     int64_t index = indices_ptr[i];
     // If using groupwise embedding
     int32_t qparams_index = index * num_groups_per_channel;
-    CTYPE_OUT zp = 0.0;
-    const CTYPE_OUT* scale_ptr = scales + qparams_index;
-    const CTYPE_OUT* zero_points_ptr = nullptr;
+    CTYPE_PARAMS zp = 0.0;
+    const CTYPE_PARAMS* scale_ptr = scales + qparams_index;
+    const CTYPE_PARAMS* zero_points_ptr = nullptr;
     if (opt_weight_zero_points.has_value()) {
       zero_points_ptr = zero_points + qparams_index;
     }
@@ -164,7 +172,7 @@ void embedding_byte_per_channel(

     for (int j = 0; j < embedding_dim; ++j) {
       int32_t group_id = j / group_size;
-      const CTYPE_OUT scale = scale_ptr[group_id];
+      const CTYPE_PARAMS scale = scale_ptr[group_id];
       if (opt_weight_zero_points.has_value()) {
         zp = zero_points_ptr[group_id];
       }
@@ -219,6 +227,7 @@ Tensor& quantized_embedding_byte_out(
     const int64_t weight_quant_min,
     const int64_t weight_quant_max,
     const Tensor& indices,
+    exec_aten::optional<ScalarType> out_dtype,
     Tensor& out) {
   // TODO (jakeszwe): improve these to account for the size of out in relation
   // to weight and indices accounting for a possible batch dimension
@@ -229,16 +238,20 @@ Tensor& quantized_embedding_byte_out(
       weight_quant_min,
       weight_quant_max,
       indices,
+      out_dtype,
       out);

-  ScalarType w_type = weight.scalar_type();
+  ScalarType weight_type = weight.scalar_type();
+  ScalarType params_type = weight_scales.scalar_type();
   ScalarType out_type = out.scalar_type();

   constexpr auto name = "quantized_decomposed::embedding_byte.out";
-  ET_SWITCH_TWO_TYPES(Byte, Char, w_type, ctx, name, CTYPE_W, [&]() {
-    ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() {
-      embedding_byte_per_channel<CTYPE_W, CTYPE_OUT>(
-          weight, weight_scales, opt_weight_zero_points, indices, out);
+  ET_SWITCH_TWO_TYPES(Byte, Char, weight_type, ctx, name, CTYPE_W, [&]() {
+    ET_SWITCH_TWO_TYPES(Float, Half, params_type, ctx, name, CTYPE_P, [&]() {
+      ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() {
+        embedding_byte_per_channel<CTYPE_W, CTYPE_P, CTYPE_OUT>(
+            weight, weight_scales, opt_weight_zero_points, indices, out);
+      });
     });
   });

@@ -253,6 +266,7 @@ Tensor& quantized_embedding_byte_out(
     int64_t weight_quant_min,
     int64_t weight_quant_max,
     const Tensor& indices,
+    exec_aten::optional<ScalarType> out_dtype,
     Tensor& out) {
   // TODO(larryliu): Add a context arg to the real op function and remove this
   // wrapper
@@ -265,6 +279,7 @@ Tensor& quantized_embedding_byte_out(
       weight_quant_min,
       weight_quant_max,
       indices,
+      out_dtype,
       out);
 }
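
The kernel change splits the old two-parameter template into three roles: CTYPE_WEIGHT (Byte/Char) for the quantized table, CTYPE_PARAMS (Float/Half) for the scales/zero-points, and CTYPE_OUT (Float/Half) for the output, with the triple-nested ET_SWITCH instantiating every combination. A Python reference of the per-row math the kernel performs, simplified to one quant group per channel (an illustrative sketch, not the shipped code):

import torch

def embedding_byte_ref(weight, scales, zero_points, indices, out_dtype):
    # Gather quantized rows and dequantize in the params dtype (CTYPE_PARAMS).
    rows = weight[indices].to(scales.dtype)
    if zero_points is not None:
        rows = rows - zero_points[indices].unsqueeze(-1)
    dequant = rows * scales[indices].unsqueeze(-1)
    return dequant.to(out_dtype)  # cast to the output dtype (CTYPE_OUT) last

w = torch.randint(-128, 127, (10, 4), dtype=torch.int8)
s = torch.rand(10, dtype=torch.float16)
y = embedding_byte_ref(w, s, None, torch.tensor([1, 9]), torch.float32)
assert y.dtype == torch.float32 and y.shape == (2, 4)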

kernels/quantized/quantized.yaml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
   - arg_meta: null
     kernel_name: torch::executor::dequantize_per_channel_out

-- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
+- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
   - arg_meta: null

kernels/quantized/test/op_embedding_test.cpp

Lines changed: 9 additions & 0 deletions
@@ -76,6 +76,7 @@ void test_dtype() {
       quant_min,
       quant_max,
       indices,
+      out.scalar_type(),
       out);

   // (8 - 1) * 0.5 = 3.5
@@ -139,6 +140,7 @@ TEST(OpQuantizedEmbeddingTest, ConsitencyWithReferencePattern) {
       quant_min,
       quant_max,
       indices,
+      out.scalar_type(),
       out);

   // Do Q DQ embedding
@@ -196,6 +198,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbedding) {
       quant_min,
       quant_max,
       indices,
+      out.scalar_type(),
       out);

   EXPECT_TENSOR_EQ(out, expected);
@@ -220,6 +223,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbedding) {
       quant_min,
       quant_max,
       indices,
+      out.scalar_type(),
       out);

   EXPECT_TENSOR_EQ(out, expected);
@@ -251,6 +255,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath1) {
           quant_min,
           quant_max,
           indices,
+          out.scalar_type(),
           out),
       "");
 }
@@ -281,6 +286,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath2) {
           quant_min,
           quant_max,
           indices,
+          out.scalar_type(),
           out),
       "");
 }
@@ -310,6 +316,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath3) {
           quant_min,
           quant_max,
           indices,
+          out.scalar_type(),
           out),
       "");
 }
@@ -339,6 +346,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath4) {
           quant_min,
           quant_max,
           indices,
+          out.scalar_type(),
           out),
       "");
 }
@@ -368,6 +376,7 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath5) {
           quant_min,
           quant_max,
           indices,
+          out.scalar_type(),
           out),
       "");
 }
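
Each test now threads out.scalar_type() through as the new out_dtype argument, and the ET_CHECK added in op_embedding.cpp rejects an explicit dtype that disagrees with the out tensor (the C++ suite exercises that path as death tests). A hedged Python-side sketch of the accepted path (assumes the quantized kernel library is built and loaded in ATen mode so the op is callable):

import torch

weight = torch.randint(-128, 127, (10, 4), dtype=torch.int8)
scales = torch.rand(10, dtype=torch.float32)
indices = torch.tensor([0, 3])
out = torch.empty(2, 4, dtype=torch.float32)

# dtype matches out.dtype, so the new check passes.
torch.ops.quantized_decomposed.embedding_byte.out(
    weight, scales, None, -128, 127, indices, dtype=out.dtype, out=out
)
# Passing dtype=torch.float16 with this fp32 out would trip the new check:
# "output_dtype must match the dtype of the out tensor".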
