Skip to content

Commit 36cd162

Browse files
committed
Add Vulkan Quantizer to Llama export lib
Pull Request resolved: #6169 TSIA. Note that only 8 bit weight only quantization is supported for now since `VulkanQuantizer` does not support 4 bit weight only quantization at the moment. ghstack-source-id: 247613963 @exported-using-ghexport Differential Revision: [D64249615](https://our.internmc.facebook.com/intern/diff/D64249615/)
1 parent a56d121 commit 36cd162

File tree

3 files changed

+28
-0
lines changed

3 files changed

+28
-0
lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
get_pt2e_quantization_params,
4242
get_pt2e_quantizers,
4343
get_qnn_quantizer,
44+
get_vulkan_quantizer,
4445
)
4546
from executorch.util.activation_memory_profiler import generate_memory_trace
4647

@@ -147,6 +148,7 @@ def build_args_parser() -> argparse.ArgumentParser:
147148
"coreml_8a_c4w",
148149
"coreml_baseline_8a_c8w",
149150
"coreml_baseline_8a_c4w",
151+
"vulkan_8w",
150152
],
151153
help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
152154
)
@@ -548,6 +550,12 @@ def get_quantizer_and_quant_params(args):
548550
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
549551
coreml_quantizer = get_coreml_quantizer(args.pt2e_quantize)
550552
quantizers.append(coreml_quantizer)
553+
if args.vulkan and args.pt2e_quantize:
554+
assert (
555+
len(quantizers) == 0
556+
), "Should not enable both vulkan and other quantizers"
557+
vulkan_quantizer = get_vulkan_quantizer(args.pt2e_quantize)
558+
quantizers.append(vulkan_quantizer)
551559
logging.info(f"Applying quantizers: {quantizers}")
552560
return pt2e_quant_params, quantizers, quant_dtype
553561

extension/llm/export/TARGETS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ runtime.python_library(
3131
"//executorch/backends/qualcomm/quantizer:quantizer",
3232
"//executorch/backends/transforms:duplicate_dynamic_quant_chain",
3333
"//executorch/backends/vulkan/partitioner:vulkan_partitioner",
34+
"//executorch/backends/vulkan/quantizer:vulkan_quantizer",
3435
"//executorch/backends/xnnpack/partition:xnnpack_partitioner",
3536
"//executorch/exir:lib",
3637
"//executorch/exir/backend:backend_details",

extension/llm/export/quantizer_lib.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,22 @@ def get_coreml_quantizer(pt2e_quantize: str):
260260
raise ValueError(f"Unsupported Core ML quantizer specification {pt2e_quantize}")
261261

262262
return quantizer
263+
264+
265+
def get_vulkan_quantizer(pt2e_quantize: str):
    """Build a ``VulkanQuantizer`` from a pt2e quantization spec string.

    Only ``"vulkan_8w"`` (8-bit per-channel weight-only quantization) is
    supported at the moment; any other spec raises ``ValueError``.

    Args:
        pt2e_quantize: The quantization spec selected on the command line.

    Returns:
        A ``VulkanQuantizer`` with the weight-only config applied globally.

    Raises:
        ValueError: If *pt2e_quantize* is not a recognized Vulkan spec.
    """
    # Imported lazily so the Vulkan backend is only required when actually used.
    from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
        get_weight_quantization_config,
        VulkanQuantizer,
    )

    # Guard clause: reject anything other than the single supported spec.
    if pt2e_quantize != "vulkan_8w":
        raise ValueError(f"Unsupported Vulkan quantizer specification {pt2e_quantize}")

    # 8-bit signed weight range, quantized per output channel.
    weight_config = get_weight_quantization_config(
        is_per_channel=True,
        weight_qmin=-128,
        weight_qmax=127,
    )
    return VulkanQuantizer().set_global(weight_config)

0 commit comments

Comments
 (0)