Default to cores/2 threads in JNI layer

GregoryComer · facebook-github-bot · commit 20ebaf3afdd3 · 2024-10-09T04:57:55.000-07:00
Summary:
Default to using cores/2 threadpool threads. The long-term plan is to
improve detection of performance cores in cpuinfo, but for now cores/2 is a sane default.

Based on testing, this is almost universally faster than using all cores, as efficiency cores can be quite slow.
In extreme cases, using all cores can be 10x slower than using cores/2.

This also matches Lite Interpreter's default behavior when it doesn't have a more precise heuristic for the target hardware.

Differential Revision: D64107326
diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK
@@ -1,4 +1,5 @@
 load("@fbsource//tools/build_defs/android:fb_android_cxx_library.bzl", "fb_android_cxx_library")
+load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
 load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib")
 
 oncall("executorch")
@@ -41,6 +42,8 @@ fb_android_cxx_library(
         "//xplat/executorch/extension/module:module_static",
         "//xplat/executorch/extension/runner_util:inputs_static",
         "//xplat/executorch/extension/tensor:tensor_static",
+        "//xplat/executorch/extension/threadpool:threadpool",
+        third_party_dep("cpuinfo"),
     ],
 )
 
diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp
@@ -17,9 +17,12 @@
 
 #include "jni_layer_constants.h"
 
+#include <cpuinfo.h>
+
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/inputs.h>
 #include <executorch/extension/tensor/tensor.h>
+#include <executorch/extension/threadpool/threadpool.h>
 #include <executorch/runtime/core/portable_type/tensor_impl.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/platform.h>
@@ -260,6 +263,23 @@ class ExecuTorchJni : public facebook::jni::HybridClass<ExecuTorchJni> {
     }
 
     module_ = std::make_unique<Module>(modelPath->toStdString(), load_mode);
+
+    // Default to using cores/2 threadpool threads. The long-term plan is to
+    // improve performant core detection in CPUInfo, but for now we can use
+    // cores/2 as a sane default.
+    //
+    // Based on testing, this is almost universally faster than using all
+    // cores, as efficiency cores can be quite slow. In extreme cases, using
+    // all cores can be 10x slower than using cores/2.
+    //
+    // TODO: Allow overriding this default from Java.
+    auto threadpool = executorch::extension::threadpool::get_threadpool();
+    if (threadpool) {
+      int thread_count = cpuinfo_get_processors_count() / 2;
+      if (thread_count > 0) {
+        threadpool->_unsafe_reset_threadpool(thread_count);
+      }
+    }
   }
 
   facebook::jni::local_ref<facebook::jni::JArrayClass<JEValue>> forward(