Add pattern + replacement for Embedding with padding_idx

salilsdesai · facebook-github-bot · commit e023d8f43a6b · 2023-08-11T12:42:27.000-07:00
Summary:
This diff adds a pattern/replacement for embedding with padding_idx, which causes embedding in the NLU model to be quantized successfully.

Previously, the embedding op in the NLU model was not being quantized. This was happening because embedding in NLU includes an extra arg, padding_idx, which was not expected by the pattern used to match embedding ops for replacement in model graphs.

This change also reduces the size of the NLU model from 11.4 MB to 4.4 MB since embedding weight tensors are stored in quantized form instead of fp32.

Reviewed By: digantdesai, mcr229

Differential Revision: D48191947

fbshipit-source-id: 47283aa8c4990325238c362d130d7e2d141fcf0f
diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py
@@ -478,12 +478,58 @@ def replacement(
             )
             return out
 
+        @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte")
+        def pattern_with_padding_idx(  # pattern: dequantize_per_channel on the weight, then aten.embedding called with an explicit padding_idx
+            weight,
+            weight_scales,
+            weight_zero_points,
+            weight_quant_min,
+            weight_quant_max,
+            indicies,
+            padding_idx,  # extra aten.embedding argument; this pattern exists to match graphs that pass it
+        ):
+            weight = torch.ops.quantized_decomposed.dequantize_per_channel.default(
+                weight,
+                weight_scales,
+                weight_zero_points,
+                0,  # axis argument: per-channel dequantization along dim 0
+                weight_quant_min,
+                weight_quant_max,
+                torch.uint8,  # dtype argument for the quantized weight
+            )
+            out = torch.ops.aten.embedding.default(weight, indicies, padding_idx)  # embedding lookup on the dequantized weight
+            return out
+
+        def replacement_with_padding_idx(  # replacement for pattern_with_padding_idx: same parameter list, a single embedding_byte call
+            weight,
+            weight_scales,
+            weight_zero_points,
+            weight_quant_min,
+            weight_quant_max,
+            indicies,
+            _,  # padding_idx only matters for training and not when running op for inference
+        ):
+            out = torch.ops.quantized_decomposed.embedding_byte.default(  # weight is passed without a dequantize step; padding_idx is not forwarded
+                weight,
+                weight_scales,
+                weight_zero_points,
+                weight_quant_min,
+                weight_quant_max,
+                indicies,
+            )
+            return out
+
         return [
             (
                 _trace_and_lower_to_edge_ops(pattern),
                 _trace_and_lower_to_edge_ops(replacement),
                 [],
-            )
+            ),
+            (
+                _trace_and_lower_to_edge_ops(pattern_with_padding_idx),
+                _trace_and_lower_to_edge_ops(replacement_with_padding_idx),
+                [],
+            ),
         ]
 
     patterns_and_replacements = []