Skip to content

Commit 4b7050d

Browse files
committed
Define embedding_4bit ops
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent f938acb commit 4b7050d

File tree

3 files changed

+2
-3
lines changed

3 files changed

+2
-3
lines changed

examples/models/llama2/custom_ops/op_sdpa.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ void cpu_flash_attention(
240240
" and num kv heads=%" PRId64,
241241
num_head,
242242
num_heads_kv);
243-
244243
int64_t num_reps = num_head / num_heads_kv;
245244

246245
bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel();

examples/models/llama2/export_llama_lib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
Transformer,
3030
)
3131
from executorch.exir.backend.backend_details import CompileSpec
32-
from executorch.exir.passes import *
32+
3333
from executorch.sdk.etrecord import generate_etrecord
3434
from executorch.util.activation_memory_profiler import generate_memory_trace
3535
from sentencepiece import SentencePieceProcessor

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def embedding_byte_dtype_out_meta(
189189

190190
quantized_decomposed_lib.define(
191191
"embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
192-
"int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)",
192+
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
193193
)
194194

195195

0 commit comments

Comments (0)