pytorch · larryliu0820 · Apr 17, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 18, 2024
@@ -539,7 +539,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
         bitwidth = int(bitwidth)
         transforms.append(
             lambda model: EmbeddingQuantHandler(
-                model, bitwidth=bitwidth, group_size=group_size
+                model,
+                bitwidth=bitwidth,
+                group_size=group_size,
+                packed=(bitwidth == 4),
             ).quantized_model()
         )
 

@@ -189,7 +189,7 @@ def embedding_byte_dtype_out_meta(
 
 quantized_decomposed_lib.define(
     "embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
-    "int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)",
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )