
Commit 0364d45

cccclai authored and facebook-github-bot committed
separate quantize and export_to_edge in builder (#3613)
Summary:
Pull Request resolved: #3613

Currently export_to_edge both applies the quantizers and runs to_edge. Separate the two steps so that quantization can be called on its own in eval_llama.py.

Reviewed By: Jack-Khuu, larryliu0820

Differential Revision: D57367832

fbshipit-source-id: 04d225df5403657cd86726af8cffb77c7c41147e
1 parent 36f83eb commit 0364d45
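
For context, a minimal sketch of the call pattern this split enables, illustrative only; the actual eval_llama.py wiring is not part of this commit:

    # Hypothetical eval-side usage: stop after quantization, without lowering to Edge.
    manager = _prepare_for_llama_export(modelname, args)   # returns a LlamaEdgeManager
    manager = manager.pt2e_quantize(quantizers)            # applies the pt2e quantizers only
    quantized_module = manager.pre_autograd_graph_module   # quantized graph module for eval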

File tree

2 files changed: +40 −13 lines changed


examples/models/llama2/builder.py

Lines changed: 35 additions & 10 deletions
@@ -142,6 +142,8 @@ def __init__(
         verbose: bool = False,
     ):
         self.model = model
+        # graph module returned from capture_pre_autograd_graph
+        self.pre_autograd_graph_module: Optional[torch.fx.GraphModule] = None
         self.modelname = modelname
         self.weight_type = weight_type
         self.dtype = dtype
@@ -251,25 +253,27 @@ def _get_metadata(self):
         self.metadata = metadata
         return self.metadata
 
-    def export_to_edge(
+    def pt2e_quantize(
         self, quantizers: Optional[List[Quantizer]]
     ) -> "LlamaEdgeManager":
         """
-        Export the model to Edge dialect and retrieve a EdgeManager.
+        Quantize the model via pt2e flow and retrieve LlamaEdgeManager including the quantized model.
         Args:
             quantizers (Optional[List[Quantizer]]): A list of quantizers.
         """
+        assert (
+            self.edge_manager is None
+        ), "export_to_edge is already called, please call pt2e_quantize before export_to_edge"
+        logging.info(f"Using pt2e {quantizers} to quantizing the model...")
         dynamic_shape = self._get_dynamic_shape()
-        edge_config = self._get_edge_config()
-        metadata = self._get_metadata()
 
         # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
         # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
-        with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-            m = capture_pre_autograd_graph(
-                self.model, self.example_inputs, dynamic_shapes=dynamic_shape
-            )
-            if quantizers:
+        if quantizers:
+            with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+                m = capture_pre_autograd_graph(
+                    self.model, self.example_inputs, dynamic_shapes=dynamic_shape
+                )
                 if self.verbose:
                     logging.info(f"Applied quantizers: {quantizers}")
                 composed_quantizer = ComposableQuantizer(quantizers)
@@ -278,8 +282,29 @@ def export_to_edge(
                 m(*self.example_inputs)
                 m = convert_pt2e(m)
                 DuplicateDynamicQuantChainPass()(m)
+                self.pre_autograd_graph_module = m
+            return self
+        else:
+            logging.info("No quantizer provided, passing...")
+            return self
+
+    def export_to_edge(self) -> "LlamaEdgeManager":
+        """
+        Export the model to Edge dialect and retrieve a LlamaEdgeManager.
+        """
+        dynamic_shape = self._get_dynamic_shape()
+        edge_config = self._get_edge_config()
+        metadata = self._get_metadata()
+
+        # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
+        # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
+        with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+            if self.pre_autograd_graph_module is None:
+                self.pre_autograd_graph_module = capture_pre_autograd_graph(
+                    self.model, self.example_inputs, dynamic_shapes=dynamic_shape
+                )
             self.edge_manager = export_to_edge(
-                m,
+                self.pre_autograd_graph_module,
                 self.example_inputs,
                 dynamic_shapes=dynamic_shape,
                 edge_constant_methods=metadata,
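
The ordering contract implied by the new methods, as a hedged sketch; the call sites below are illustrative, not code from this diff:

    # Quantized flow: pt2e_quantize stores the quantized graph in
    # pre_autograd_graph_module, which export_to_edge then reuses.
    _prepare_for_llama_export(modelname, args).pt2e_quantize(quantizers).export_to_edge()

    # Unquantized flow: pre_autograd_graph_module is still None, so
    # export_to_edge captures the pre-autograd graph itself.
    _prepare_for_llama_export(modelname, args).export_to_edge()

    # Wrong order: pt2e_quantize asserts that edge_manager is None, so calling
    # it after export_to_edge raises an AssertionError.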

examples/models/llama2/export_llama_lib.py

Lines changed: 5 additions & 3 deletions
@@ -378,9 +378,11 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
         qnn_quantizer, quant_dtype = get_qnn_quantizer(args)
         quantizers.append(qnn_quantizer)
 
-    builder_exported_to_edge = _prepare_for_llama_export(
-        modelname, args
-    ).export_to_edge(quantizers)
+    builder_exported_to_edge = (
+        _prepare_for_llama_export(modelname, args)
+        .pt2e_quantize(quantizers)
+        .export_to_edge()
+    )
 
     modelname = builder_exported_to_edge.modelname
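
The chained call remains valid when no quantizer is requested; a short sketch of that fallback, illustrative and assuming quantizers ends up as an empty list:

    # An empty quantizers list is falsy, so pt2e_quantize takes the else branch,
    # logs "No quantizer provided, passing...", and returns self unchanged;
    # export_to_edge() then behaves like the previous single-step call.
    quantizers = []
    builder_exported_to_edge = (
        _prepare_for_llama_export(modelname, args)
        .pt2e_quantize(quantizers)
        .export_to_edge()
    )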
