File tree Expand file tree Collapse file tree 3 files changed +13
-5
lines changed
backends/qualcomm/quantizer
examples/qualcomm/oss_scripts/llama3_2 Expand file tree Collapse file tree 3 files changed +13
-5
lines changed Original file line number Diff line number Diff line change 22
22
from torch .fx import Node
23
23
24
24
25
- def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
25
+ def annotate_matmul_16a8w(
26
+     gm: torch.fx.GraphModule, traverse_input1=True
27
+ ) -> None:  # noqa: C901
26
28
"""
27
29
This function is specific for matmul op 16a8w.
28
30
"""
@@ -99,7 +101,8 @@ def annotate_matmul_input1(node: Node):
99
101
for node in gm .graph .nodes :
100
102
if node .op == "call_function" and node .target == torch .ops .aten .matmul .default :
101
103
annotate_matmul (node , quantization_config_16a8w )
102
- annotate_matmul_input1(node.args[1])
104
+ if traverse_input1:
105
+     annotate_matmul_input1(node.args[1])
103
106
104
107
105
108
def custom_annotate_llama_matmul_16a8w (gm : torch .fx .GraphModule ) -> None : # noqa: C901
Original file line number Diff line number Diff line change 8
8
import json
9
9
import logging
10
10
import os
11
-
12
11
import sys
13
12
import time
13
+ from functools import partial
14
14
from multiprocessing .connection import Client
15
15
16
16
import torch
@@ -319,8 +319,10 @@ def compile(args):
319
319
320
320
if args .model_mode == "kv" :
321
321
use_kv_cache = output_new_cache_only = True
322
+ matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=True)
322
323
elif args .model_mode == "batch_prefill" :
323
324
use_kv_cache = output_new_cache_only = False
325
+ matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=False)
324
326
elif args .model_mode == "hybrid" :
325
327
raise NotImplementedError (
326
328
f"model_mode { args .model_mode } is not implemented yet."
@@ -385,7 +387,10 @@ def compile(args):
385
387
start_quantize_ts = time .time ()
386
388
single_llama .quantize (
387
389
quant_dtype ,
388
- custom_annotations=(annotate_matmul_16a8w,),
390
+ custom_annotations=(
391
+     custom_annotate_llama_last_conv_16a8w,
392
+     matmul_annotate_func,
393
+ ),
389
394
)
390
395
end_quantize_ts = time .time ()
391
396
logging .info (f"Time for quantizing: { end_quantize_ts - start_quantize_ts } " )
Original file line number Diff line number Diff line change @@ -137,7 +137,7 @@ def python_is_compatible():
137
137
"timm==1.0.7" ,
138
138
f"torchaudio==2.5.0.{ NIGHTLY_VERSION } " if USE_PYTORCH_NIGHTLY else "torchaudio" ,
139
139
"torchsr==1.0.4" ,
140
- "transformers==4.42.4" , # TODO update back to 4. 46.1 once the error is fixed
140
+ "transformers==4.46.1" ,
141
141
]
142
142
143
143
# pip packages needed for development.
You can’t perform that action at this time.
0 commit comments