Skip to content

Commit 3ef5c3e

Browse files
kimishpatel authored and malfet committed
Port number of threads selection logic from executorch (#505)
Summary: Without this optimization, llama3 on an S22 runs at around 4 tok/sec. With the fix it is > 7 tok/sec.

Test Plan:
./runner/build_android.sh
python3 torchchat.py download llama3
python3 torchchat.py export llama3 --output-pte-path llama3.pte --quantize config/data/mobile.json
adb push llama3.pte /data/local/tmp/
adb push tokenizer.model /data/local/tmp/
adb shell "cd /data/local/tmp/ && ./et_run llama3.pte -z tokenizer.model -t 0 -i "Once upon" -n 124"

Reviewers: Subscribers: Tasks: Tags:
1 parent 202de74 commit 3ef5c3e

File tree

3 files changed

+30
-3
lines changed

3 files changed

+30
-3
lines changed

runner/build_android.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ export CMAKE_OUT_DIR="cmake-out-android"
3030
build_runner_et() {
3131
rm -rf cmake-out-android
3232
echo "ET BUILD DIR IS ${ET_BUILD_DIR}"
33-
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -S . -B cmake-out-android -G Ninja
33+
cmake -DET_USE_ADPATIVE_THREADS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -S . -B cmake-out-android -G Ninja
3434
cmake --build cmake-out-android/ -j16 --config Release --target et_run
3535
}
3636

runner/et.cmake

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,25 @@ if(executorch_FOUND)
4848

4949
cmake_print_variables(_common_include_directories)
5050

51+
set(_srcs runner/run.cpp)
52+
set(_common_compile_options -D__ET__MODEL -D_GLIBCXX_USE_CXX11_ABI=1)
53+
if(ET_USE_ADPATIVE_THREADS)
54+
list(APPEND _common_compile_options -DET_USE_ADPATIVE_THREADS)
55+
56+
set(EXECUTORCH_SRC_ROOT ${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src/executorch)
57+
set(XNNPACK_ROOT ${EXECUTORCH_SRC_ROOT}/backends/xnnpack)
58+
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp)
59+
list(APPEND _common_include_directories
60+
${XNNPACK_ROOT}/third-party/cpuinfo/include)
61+
62+
list(APPEND _common_include_directories
63+
${XNNPACK_ROOT}/third-party/pthreadpool/include)
64+
endif()
65+
5166
target_include_directories(executorch INTERFACE ${_common_include_directories}) # Ideally ExecuTorch installation process would do this
52-
add_executable(et_run runner/run.cpp)
67+
add_executable(et_run ${_srcs})
5368

54-
target_compile_options(et_run PUBLIC -D__ET__MODEL -D_GLIBCXX_USE_CXX11_ABI=1)
69+
target_compile_options(et_run PUBLIC ${_common_compile_options})
5570

5671
# Link ET runtime + extensions
5772
target_link_libraries(

runner/run.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
#include <executorch/runtime/core/exec_aten/exec_aten.h>
2727
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
2828

29+
#if defined(ET_USE_ADPATIVE_THREADS)
30+
#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
31+
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
32+
#endif
33+
2934
using exec_aten::ScalarType;
3035
using torch::executor::EValue;
3136
using torch::executor::ManagedTensor;
@@ -633,6 +638,13 @@ int main(int argc, char* argv[]) {
633638
char* system_prompt =
634639
NULL; // the (optional) system prompt to use in chat mode
635640

641+
#if defined(ET_USE_ADPATIVE_THREADS)
642+
uint32_t num_performant_cores = torch::executorch::cpuinfo::get_num_performant_cores();
643+
if (num_performant_cores > 0) {
644+
torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
645+
num_performant_cores);
646+
}
647+
#endif
636648
// poor man's C argparse so we can override the defaults above from the
637649
// command line
638650
if (argc >= 2) {

0 commit comments

Comments (0)