Skip to content

Commit a28e73b

Browse files
committed
Define embedding_4bit ops
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent f938acb commit a28e73b

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

examples/models/llama2/custom_ops/op_sdpa.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ void cpu_flash_attention(
240240
" and num kv heads=%" PRId64,
241241
num_head,
242242
num_heads_kv);
243-
244243
int64_t num_reps = num_head / num_heads_kv;
245244

246245
bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel();

examples/models/llama2/export_llama_lib.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
Transformer,
3030
)
3131
from executorch.exir.backend.backend_details import CompileSpec
32-
from executorch.exir.passes import *
32+
3333
from executorch.sdk.etrecord import generate_etrecord
3434
from executorch.util.activation_memory_profiler import generate_memory_trace
3535
from sentencepiece import SentencePieceProcessor
@@ -539,7 +539,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
539539
bitwidth = int(bitwidth)
540540
transforms.append(
541541
lambda model: EmbeddingQuantHandler(
542-
model, bitwidth=bitwidth, group_size=group_size, packed=(bitwidth==4),
542+
model,
543+
bitwidth=bitwidth,
544+
group_size=group_size,
545+
packed=(bitwidth == 4),
543546
).quantized_model()
544547
)
545548

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def embedding_byte_dtype_out_meta(
189189

190190
quantized_decomposed_lib.define(
191191
"embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
192-
"int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)",
192+
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
193193
)
194194

195195

0 commit comments

Comments (0)