|
17 | 17 |
|
18 | 18 | #include "jni_layer_constants.h"
|
19 | 19 |
|
| 20 | +#include <cpuinfo.h> |
| 21 | + |
20 | 22 | #include <executorch/extension/module/module.h>
|
21 | 23 | #include <executorch/extension/runner_util/inputs.h>
|
22 | 24 | #include <executorch/extension/tensor/tensor.h>
|
|
25 | 27 | #include <executorch/runtime/platform/platform.h>
|
26 | 28 | #include <executorch/runtime/platform/runtime.h>
|
27 | 29 |
|
| 30 | +#ifdef ET_USE_THREADPOOL |
| 31 | +#include <executorch/extension/threadpool/threadpool.h> |
| 32 | +#endif |
| 33 | + |
28 | 34 | #include <fbjni/ByteBuffer.h>
|
29 | 35 | #include <fbjni/fbjni.h>
|
30 | 36 |
|
@@ -260,6 +266,25 @@ class ExecuTorchJni : public facebook::jni::HybridClass<ExecuTorchJni> {
|
260 | 266 | }
|
261 | 267 |
|
262 | 268 | module_ = std::make_unique<Module>(modelPath->toStdString(), load_mode);
|
| 269 | + |
| 270 | +#ifdef ET_USE_THREADPOOL |
| 271 | + // Default to using cores/2 threadpool threads. The long-term plan is to |
| 272 | + // improve detection of performant cores in CPUInfo, but for now we can |
| 273 | + // use cores/2 as a sane default. |
| 274 | + // |
| 275 | + // Based on testing, this is almost universally faster than using all |
| 276 | + // cores, as efficiency cores can be quite slow. In extreme cases, using |
| 277 | + // all cores can be 10x slower than using cores/2. |
| 278 | + // |
| 279 | + // TODO Allow overriding this default from Java. |
| 280 | + auto threadpool = executorch::extension::threadpool::get_threadpool(); |
| 281 | + if (threadpool) { |
| 282 | + int thread_count = cpuinfo_get_processors_count() / 2; |
| 283 | + if (thread_count > 0) { |
| 284 | + threadpool->_unsafe_reset_threadpool(thread_count); |
| 285 | + } |
| 286 | + } |
| 287 | +#endif |
263 | 288 | }
|
264 | 289 |
|
265 | 290 | facebook::jni::local_ref<facebook::jni::JArrayClass<JEValue>> forward(
|
|
0 commit comments