[Executorch][llama] Set # of threads to use performant cores

kimishpatel · kimishpatel · commit 76921f8b4c92 · 2024-03-12T10:46:26.000-07:00
Pull Request resolved: #2352 When all cores are used, the slower ones drag performance down by blocking the large cores. Once we have uarch-specific implementations we may no longer need this, but the tool is useful in general until we have a better API. ghstack-source-id: 218361721 @exported-using-ghexport Differential Revision: [D54766071](https://our.internmc.facebook.com/intern/diff/D54766071/)
diff --git a/examples/models/llama2/main.cpp b/examples/models/llama2/main.cpp
@@ -10,6 +10,9 @@
 
 #include <executorch/examples/models/llama2/runner/runner.h>
 
+#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
+
 DEFINE_string(
     model_path,
     "llama2.pte",
@@ -45,6 +48,12 @@ int32_t main(int32_t argc, char** argv) {
 
   int32_t seq_len = FLAGS_seq_len;
 
+  uint32_t num_performant_cores =
+      torch::executorch::cpuinfo::get_num_performant_cores();
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
+      num_performant_cores);
   // create llama runner
   ::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
 
diff --git a/examples/models/llama2/targets.bzl b/examples/models/llama2/targets.bzl
@@ -16,6 +16,8 @@ def define_common_targets():
                 deps = [
                     "//executorch/examples/models/llama2/runner:runner" + aten_suffix,
                     "//executorch/extension/evalue_util:print_evalue",
+                    "//executorch/backends/xnnpack/threadpool:threadpool",
+                    "//executorch/backends/xnnpack/threadpool:cpuinfo_utils",
                 ],
                 external_deps = [
                     "gflags",