Commit f4936e7

Add a parameter to output delegate summary in llama export
Differential Revision: D68991594
Pull Request resolved: #8174
1 parent: b362ab7

2 files changed: +15 -2 lines changed

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 13 additions & 0 deletions
@@ -51,6 +51,8 @@
     get_soc_to_chipset_map,
     update_spill_fill_size,
 )
+
+from executorch.devtools.backend_debug import print_delegation_info
 from executorch.examples.models.llama.source_transformation.quantize import (
     get_quant_embedding_transform,
 )
@@ -389,6 +391,7 @@ def lowering_modules(
         num_sharding=1,
         passes_job=OrderedDict(),
         shared_buffer=False,
+        verbose=False,
     ):
         executorch_config = ExecutorchBackendConfig(
             # For shared buffer, user must pass the memory address
@@ -440,6 +443,10 @@ def lowering_modules(
             edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
             if num_sharding > 1:
                 update_spill_fill_size(edge_prog_mgr.exported_program())
+
+            if verbose:
+                print_delegation_info(edge_prog_mgr.exported_program().graph_module)
+
             exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
             with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file:
                 exec_prog_mgr.write_to_file(file)
@@ -667,6 +674,10 @@ def compile(args, pte_filename, tokenizer):
         )
         compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options)

+    if args.verbose:
+        for exported_program in exported_programs:
+            print_delegation_info(exported_program.graph_module)
+
     executorch_config = ExecutorchBackendConfig(
         # For shared buffer, user must pass the memory address
         # which is allocated by RPC memory to executor runner.
@@ -980,6 +991,8 @@ def _build_parser():
         help="Fallback to cpu embedding operator and type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '4,32'.",
     )

+    parser.add_argument("-v", "--verbose", action="store_true")
+
     return parser

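For reference, the verbose path above relies on print_delegation_info from executorch.devtools.backend_debug, which prints a summary of how many subgraphs and which operators were delegated to the backend. Below is a minimal sketch of the same call outside llama.py, assuming a toy module and a plain torch.export / to_edge flow (TinyModel and that flow are illustrative assumptions, not taken from this commit; only print_delegation_info is):

import torch
from executorch.devtools.backend_debug import print_delegation_info
from executorch.exir import to_edge

# Hypothetical toy module, used only to produce a graph to summarize.
class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

exported = torch.export.export(TinyModel(), (torch.randn(4),))
edge_prog_mgr = to_edge(exported)

# llama.py calls this after to_backend(partitioner), so the summary also lists
# the nodes delegated to the QNN backend; with no partitioner applied it simply
# reports an undelegated graph.
print_delegation_info(edge_prog_mgr.exported_program().graph_module)
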
examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp

Lines changed: 2 additions & 2 deletions
@@ -557,8 +557,8 @@ SmartMaskIoMgr::SmartMaskIoMgr(
     const bool use_int64_token)
     : IoMgrBase(modules),
       shard_layers_({num_layers}),
-      prefill_cache_len_(prefill_cache_len),
       kv_cache_len_(kv_cache_len),
+      prefill_cache_len_(prefill_cache_len),
       vocab_size_(vocab_size),
       num_layers_(num_layers),
       head_dim_(head_dim),
@@ -1002,7 +1002,7 @@ void SmartMaskIoMgr::prepare_prefill_io(

   // [O]: logits
   int logit_index = 0;
-  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(0);
+  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(logit_index);
   prefill_logits_ = std::make_unique<TensorImpl>(
       logits->scalar_type(),
       logits->sizes().size(),
