Commit f4936e7

Add a parameter to output delegate summary in llama export
Differential Revision: D68991594
Pull Request resolved: #8174
1 parent: b362ab7

2 files changed: +15 -2 lines changed

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 13 additions & 0 deletions
@@ -51,6 +51,8 @@
     get_soc_to_chipset_map,
     update_spill_fill_size,
 )
+
+from executorch.devtools.backend_debug import print_delegation_info
 from executorch.examples.models.llama.source_transformation.quantize import (
     get_quant_embedding_transform,
 )
@@ -389,6 +391,7 @@ def lowering_modules(
         num_sharding=1,
         passes_job=OrderedDict(),
         shared_buffer=False,
+        verbose=False,
     ):
         executorch_config = ExecutorchBackendConfig(
             # For shared buffer, user must pass the memory address
@@ -440,6 +443,10 @@ def lowering_modules(
             edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
             if num_sharding > 1:
                 update_spill_fill_size(edge_prog_mgr.exported_program())
+
+            if verbose:
+                print_delegation_info(edge_prog_mgr.exported_program().graph_module)
+
             exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
             with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file:
                 exec_prog_mgr.write_to_file(file)
@@ -667,6 +674,10 @@ def compile(args, pte_filename, tokenizer):
         )
         compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options)

+    if args.verbose:
+        for exported_program in exported_programs:
+            print_delegation_info(exported_program.graph_module)
+
     executorch_config = ExecutorchBackendConfig(
         # For shared buffer, user must pass the memory address
         # which is allocated by RPC memory to executor runner.
@@ -980,6 +991,8 @@ def _build_parser():
         help="Fallback to cpu embedding operator and type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '4,32'.",
     )

+    parser.add_argument("-v", "--verbose", action="store_true")
+
     return parser

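For reference, the verbose path above relies on print_delegation_info from executorch.devtools.backend_debug, which prints a summary of how many subgraphs and which operators were delegated to the backend. Below is a minimal sketch of the same call outside llama.py, assuming a toy module and a plain torch.export / to_edge flow (TinyModel and that flow are illustrative assumptions, not taken from this commit; only print_delegation_info is):

import torch
from executorch.devtools.backend_debug import print_delegation_info
from executorch.exir import to_edge

# Hypothetical toy module, used only to produce a graph to summarize.
class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

exported = torch.export.export(TinyModel(), (torch.randn(4),))
edge_prog_mgr = to_edge(exported)

# llama.py calls this after to_backend(partitioner), so the summary also lists
# the nodes delegated to the QNN backend; with no partitioner applied it simply
# reports an undelegated graph.
print_delegation_info(edge_prog_mgr.exported_program().graph_module)
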
examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp

Lines changed: 2 additions & 2 deletions
@@ -557,8 +557,8 @@ SmartMaskIoMgr::SmartMaskIoMgr(
     const bool use_int64_token)
     : IoMgrBase(modules),
       shard_layers_({num_layers}),
-      prefill_cache_len_(prefill_cache_len),
       kv_cache_len_(kv_cache_len),
+      prefill_cache_len_(prefill_cache_len),
       vocab_size_(vocab_size),
       num_layers_(num_layers),
       head_dim_(head_dim),
@@ -1002,7 +1002,7 @@ void SmartMaskIoMgr::prepare_prefill_io(

   // [O]: logits
   int logit_index = 0;
-  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(0);
+  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(logit_index);
   prefill_logits_ = std::make_unique<TensorImpl>(
       logits->scalar_type(),
       logits->sizes().size(),
