Use Helios' decomposition for SDPA before quantizing

Matthias Cremon · facebook-github-bot · commit c61728464f5b · 2024-07-09T15:56:33.000-07:00
Summary: As titled: decompose SDPA (scaled dot-product attention) before quantizing. This exposes the `bmm` nodes in the graph and allows us to quantize them in a subsequent diff.

Differential Revision: D59503355
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
@@ -35,6 +35,7 @@ python_library(
         "//executorch/backends/cadence/aot/quantizer:fusion_pass",
         "//executorch/backends/cadence/aot/quantizer:quantizer",
         "//executorch/exir:lib",
+        "//on_device_ai/helios/quantization:quantization",
     ],
 )
 
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
@@ -24,6 +24,7 @@
 )
 from executorch.backends.cadence.aot.utils import model_is_quantized
 from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
+from on_device_ai.helios.quantization.transforms import decompose_SDPA_turing
 from pyre_extensions import assert_is_instance
 from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.pt2e.export_utils import model_is_exported
@@ -47,6 +48,9 @@ def quantize_pt2(
     # Export with dynamo
     model_exp = capture_pre_autograd_graph(model, inputs)
 
+    # Decompose SDPA (grab the pass from Turing)
+    decompose_SDPA_turing(model_exp)
+
     # Prepare
     prepared_model = prepare_pt2e(model_exp, quantizer)
 

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ python_library(`
`35`	`35`	`"//executorch/backends/cadence/aot/quantizer:fusion_pass",`
`36`	`36`	`"//executorch/backends/cadence/aot/quantizer:quantizer",`
`37`	`37`	`"//executorch/exir:lib",`
	`38`	`+ "//on_device_ai/helios/quantization:quantization",`
`38`	`39`	`],`
`39`	`40`	`)`
`40`	`41`