 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from build.utils import find_multiple, get_precision
+from build.utils import find_multiple, get_precision, use_et_backend


 #########################################################################
@@ -92,30 +92,6 @@ def quantized_model(self) -> nn.Module:
         return self.quantizer.quantize(self.model_)


-#########################################################################
-### QuantHandler API definition ###
-### (unify with torchao in future) ###
-
-
-class QuantHandler:
-    def __init__(self, model: nn.Module, device="cpu", tokenizer=None):
-        self.model_ = model
-        self.device = device
-        self.tokenizer = tokenizer
-
-    def create_quantized_state_dict(self) -> Dict:  # "StateDict"
-        pass
-
-    def convert_for_runtime(self) -> nn.Module:
-        pass
-
-    def quantized_model(self) -> nn.Module:
-        model_updated_state_dict = self.create_quantized_state_dict()
-        self.convert_for_runtime()
-        self.model_.load_state_dict(model_updated_state_dict)
-        return self.model_
-
-
 #########################################################################
 ### wrapper for setting precision as a QuantHandler ###

@@ -647,6 +623,12 @@ def __init__(
         self.groupsize = groupsize
         self.dtype = dtype
         self.packed = packed
+
+        if use_et_backend():
+            self.forward = self.et_forward
+        else:
+            self.forward = self.aoti_forward
+
         if not packed:
             self.register_buffer(
                 "weight",
@@ -675,12 +657,18 @@ def __init__(
         )

     @torch.no_grad()
-    def forward(self, indices: torch.Tensor) -> torch.Tensor:
-        if False:  # Used for Executorch
-            return torch.ops.llama_quantized.embedding_byte.dtype(
+    def et_forward(self, indices: torch.Tensor) -> torch.Tensor:
+        if self.packed:
+            return torch.ops.quantized_decomposed.embedding_byte.dtype(
+                self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+            )
+        else:
+            return torch.ops.quantized_decomposed.embedding_4bit.dtype(
                 self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
             )

+    @torch.no_grad()
+    def aoti_forward(self, indices: torch.Tensor) -> torch.Tensor:
         # result_weights = self.weight.index_select(0, indices.view(-1))
         # result_scales = self.scales.index_select(0, indices.view(-1))

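
For context, the change above picks the embedding's forward implementation once at construction time (via the newly imported use_et_backend()) instead of branching through a dead "if False:" path on every call. Below is a minimal, self-contained sketch of that dispatch pattern, not the PR's code: it assumes 8-bit rows with one scale per embedding row, use_et_backend here is a hard-coded stand-in for the helper imported from build.utils, and the ExecuTorch op call is only a placeholder that would require the quantized_decomposed kernels to be registered.

# Minimal sketch of the construction-time forward dispatch (illustrative only).
import torch
import torch.nn as nn


def use_et_backend() -> bool:
    # Stand-in for build.utils.use_et_backend(): reports whether the model is
    # being lowered for ExecuTorch. Hard-coded so the sketch runs eagerly.
    return False


class QuantizedEmbeddingSketch(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, dtype=torch.float32):
        super().__init__()
        self.dtype = dtype
        # int8 rows plus one dequantization scale per row.
        self.register_buffer(
            "weight", torch.zeros(num_embeddings, embedding_dim, dtype=torch.int8)
        )
        self.register_buffer("scales", torch.ones(num_embeddings, dtype=dtype))
        # Bind the forward implementation once, instead of checking the backend
        # on every call.
        if use_et_backend():
            self.forward = self.et_forward
        else:
            self.forward = self.aoti_forward

    @torch.no_grad()
    def et_forward(self, indices: torch.Tensor) -> torch.Tensor:
        # ExecuTorch path: keep the lookup as a quantized op so it can be
        # lowered; needs the ExecuTorch kernels registered to actually run.
        return torch.ops.quantized_decomposed.embedding_byte.dtype(
            self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
        )

    @torch.no_grad()
    def aoti_forward(self, indices: torch.Tensor) -> torch.Tensor:
        # Eager/AOTI path: gather int8 rows and their scales, then dequantize.
        rows = self.weight.index_select(0, indices.reshape(-1)).to(dtype=self.dtype)
        scales = self.scales.index_select(0, indices.reshape(-1)).unsqueeze(-1)
        return (rows * scales).reshape(*indices.shape, -1)


# Example: a 16-token vocabulary with 8-dim embeddings, looked up eagerly.
emb = QuantizedEmbeddingSketch(16, 8)
print(emb(torch.tensor([[1, 2], [3, 4]])).shape)  # torch.Size([2, 2, 8])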