|
17 | 17 |
|
18 | 18 | #include "jni_layer_constants.h"
|
19 | 19 |
|
| 20 | +#include <cpuinfo.h> |
| 21 | + |
20 | 22 | #include <executorch/extension/module/module.h>
|
21 | 23 | #include <executorch/extension/runner_util/inputs.h>
|
22 | 24 | #include <executorch/extension/tensor/tensor.h>
|
|
25 | 27 | #include <executorch/runtime/platform/platform.h>
|
26 | 28 | #include <executorch/runtime/platform/runtime.h>
|
27 | 29 |
|
| 30 | +#ifdef ET_USE_THREADPOOL |
| 31 | +#include <executorch/extension/threadpool/threadpool.h> |
| 32 | +#endif |
| 33 | + |
28 | 34 | #include <fbjni/ByteBuffer.h>
|
29 | 35 | #include <fbjni/fbjni.h>
|
30 | 36 |
|
@@ -260,6 +266,25 @@ class ExecuTorchJni : public facebook::jni::HybridClass<ExecuTorchJni> {
|
260 | 266 | }
|
261 | 267 |
|
262 | 268 | module_ = std::make_unique<Module>(modelPath->toStdString(), load_mode);
|
| 269 | + |
| 270 | +#ifdef ET_USE_THREADPOOL |
| 271 | + // Default to using cores/2 threadpool threads. The long-term plan is to |
| 272 | + // improve detection of performant cores in CPUInfo, but for now we can |
| 273 | + // use cores/2 as a sane default. |
| 274 | + // |
| 275 | + // Based on testing, this is almost universally faster than using all |
| 276 | + // cores, as efficiency cores can be quite slow. In extreme cases, using |
| 277 | + // all cores can be 10x slower than using cores/2. |
| 278 | + // |
| 279 | + // TODO Allow overriding this default from Java. |
| 280 | + auto threadpool = executorch::extension::threadpool::get_threadpool(); |
| 281 | + if (threadpool) { |
| 282 | + int thread_count = cpuinfo_get_processors_count() / 2; |
| 283 | + if (thread_count > 0) { |
| 284 | + threadpool->_unsafe_reset_threadpool(thread_count); |
| 285 | + } |
| 286 | + } |
| 287 | +#endif |
263 | 288 | }
|
264 | 289 |
|
265 | 290 | facebook::jni::local_ref<facebook::jni::JArrayClass<JEValue>> forward(
|
|
0 commit comments