use partitioner instance directly in to_backend #2513

Closed · wants to merge 1 commit

examples/models/llama2/builder.py (6 additions, 20 deletions)

@@ -12,7 +12,7 @@
 import logging
 from enum import Enum
 from json import JSONDecodeError
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
@@ -286,30 +286,18 @@ def export_to_edge(
         )
         return self
 
-    def to_backend(
-        self, partitioner: Union[Partitioner, Dict[str, Partitioner]]
-    ) -> "LlamaEdgeManager":
+    def to_backend(self, partitioner: Optional[Partitioner]) -> "LlamaEdgeManager":
         """
         Partition the model and lower to different backends. The signature is
         aligned with the signature of `to_backend` method of EdgeManager.
         Args:
-            partitioner (Union[Partitioner, Dict[str, Partitioner]]): One or more
+            partitioner (Optional[Partitioner]): One or more
             partitioner to be sent to EdgeManager.to_backend().
         """
         assert self.edge_manager is not None, "Need to run export_to_edge() first"
-        if isinstance(partitioner, dict):
-            for key, p in partitioner.items():
-                assert self.edge_manager is not None
-                self.edge_manager = self.edge_manager.to_backend(p)
-                if self.verbose:
-                    logging.info(
-                        print_delegated_graph(
-                            self.edge_manager.exported_program().graph_module
-                        )
-                    )
-                    logging.info(f"Applied partitioners: {key}")
-        elif isinstance(partitioner, Partitioner):
-            assert self.edge_manager is not None
+        if partitioner is None:
+            logging.info("No partitioner provided, passing...")
+        else:
             self.edge_manager = self.edge_manager.to_backend(partitioner)
             if self.verbose:
                 logging.info(
@@ -318,8 +306,6 @@ def to_backend(
                     )
                 )
                 logging.info(f"Applied partitioners: {partitioner}")
-        else:
-            logging.warning("Invalid partitioner, skipping...")
         return self
 
     def to_executorch(self) -> "LlamaEdgeManager":

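With this change, `LlamaEdgeManager.to_backend()` takes a single `Partitioner` instance or `None` rather than a dict keyed by partitioner class name. A minimal sketch of the new call pattern, not part of this PR: it assumes a `LlamaEdgeManager` named `builder` on which `export_to_edge()` has already been run, and an XNNPACK import path that this diff does not show.

```python
# Minimal sketch of the new API, not part of this PR. Assumes `builder` is a
# LlamaEdgeManager that has already gone through export_to_edge(), and that
# XnnpackDynamicallyQuantizedPartitioner lives at the path used by the llama2
# example at the time of this PR (an assumption).
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
)

# Pass one partitioner instance directly...
builder = builder.to_backend(XnnpackDynamicallyQuantizedPartitioner())

# ...or pass None, which now just logs "No partitioner provided" and skips delegation.
builder = builder.to_backend(None)
```
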
examples/models/llama2/export_llama_lib.py (9 additions, 13 deletions)

@@ -529,22 +529,18 @@ def _export_llama(modelname, args) -> str: # noqa: C901
     ).export_to_edge(quantizers)
 
     # to_backend
-    partitioners = {}
+    partitioner = None
     if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None:
-        partitioners[XnnpackDynamicallyQuantizedPartitioner.__name__] = (
-            XnnpackDynamicallyQuantizedPartitioner()
-        )
+        partitioner = XnnpackDynamicallyQuantizedPartitioner()
         modelname = f"xnnpack_dq_{modelname}"
 
     if args.xnnpack:
         # Following changes due to.
         # 1. We need dynamically quantized partitioner for both pt2e_quantize options
         # as well as "qmode int4" which is also dynamic quantizes linear layers.
         # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops.
-        partitioners[XnnpackDynamicallyQuantizedPartitioner.__name__] = (
-            XnnpackDynamicallyQuantizedPartitioner()
-        )
-        # partitioners[XnnpackPartitioner.__name__] = XnnpackPartitioner()
+        partitioner = XnnpackDynamicallyQuantizedPartitioner()
+        # partitioner = XnnpackPartitioner()
         modelname = f"xnnpack_{modelname}"
 
     if args.vulkan:
@@ -555,7 +551,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
             args.quantization_mode is None
         ), "Vulkan backend does not support quantization at the moment"
 
-        partitioners[VulkanPartitioner.__name__] = VulkanPartitioner()
+        partitioner = VulkanPartitioner()
         modelname = f"vulkan_{modelname}"
 
     if args.mps:
@@ -574,7 +570,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
 
         compile_specs = [CompileSpec("use_fp16", bytes([True]))]
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple`.
-        partitioners[MPSPartitioner.__name__] = MPSPartitioner(compile_specs)
+        partitioner = MPSPartitioner(compile_specs)
         modelname = f"mps_{modelname}"
 
     if args.coreml:
@@ -605,7 +601,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
             model_type=CoreMLBackend.MODEL_TYPE.MODEL,
         )
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple`
-        partitioners[CoreMLPartitioner.__name__] = CoreMLPartitioner(
+        partitioner = CoreMLPartitioner(
            skip_ops_for_coreml_delegation=None, compile_specs=compile_specs
         )
         modelname = f"coreml_{modelname}"
@@ -617,7 +613,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
         logging.info("Generating etrecord")
         # Copy the edge manager which will be serialized into etrecord. This is memory-wise expensive.
         edge_manager_copy = copy.deepcopy(builder_exported_to_edge.edge_manager)
-        builder = builder_exported_to_edge.to_backend(partitioners).to_executorch()
+        builder = builder_exported_to_edge.to_backend(partitioner).to_executorch()
 
         # Generate ETRecord
         if edge_manager_copy:
@@ -628,7 +624,7 @@ def _export_llama(modelname, args) -> str: # noqa: C901
             )
         logging.info("Generated etrecord.bin")
     else:
-        builder = builder_exported_to_edge.to_backend(partitioners).to_executorch()
+        builder = builder_exported_to_edge.to_backend(partitioner).to_executorch()
 
     if args.profile_memory:
         generate_memory_trace(builder.export_program, "memory_profile.json")

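Because each backend flag now assigns the same `partitioner` variable instead of adding a dict entry, only one partitioner reaches `to_backend()`; if several backend flags are set, the last assignment in `_export_llama()` wins. A condensed sketch of that flow, not part of this PR: the `args` namespace and `builder_exported_to_edge` come from the surrounding function, and the import path is an assumption.

```python
# Condensed sketch of the selection flow above, not part of this PR.
# `args` is assumed to be the parsed CLI namespace used by _export_llama(),
# `builder_exported_to_edge` the LlamaEdgeManager returned by export_to_edge(),
# and the import path an assumption about the repo layout at the time.
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
)

partitioner = None  # stays None when no backend flag is given

if args.xnnpack:
    partitioner = XnnpackDynamicallyQuantizedPartitioner()
# args.vulkan, args.mps and args.coreml assign `partitioner` the same way,
# so setting several backend flags keeps only the last assignment.

# Whatever was selected (possibly None) is handed straight to the builder.
builder = builder_exported_to_edge.to_backend(partitioner).to_executorch()
```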