[LLAVA] Enable 2nd XNNPACK Partition pass for the text model

digantdesai · digantdesai · commit dc164f50aa04 · 2024-08-28T22:10:54.000-07:00
This is to pick up ops like mul, add, sigmoid, etc., which contribute to
the e2e latency.
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
@@ -208,10 +208,15 @@ def export_all(llava_model: LlavaModel):
         partitioner={
             "image_encoder": [XnnpackPartitioner()],
             "text_model": [
+                # First partition the DQLinear nodes, then partition the remaining nodes.
+                # This keeps multiple DQLinear nodes out of the same partition, so that
+                # multiple unpacked and packed weight buffers are not held in memory
+                # at the same time, which reduces the peak memory footprint.
                 XnnpackPartitioner(
                     config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
                     per_op_mode=True,
-                )
+                ),
+                XnnpackPartitioner(),
             ],
         },
         compile_config=EdgeCompileConfig(_check_ir_validity=False),