Integrate the Core ML delegate into llama_main (#4160)

cccclai · facebook-github-bot · commit b91c20bc5a5c · 2024-07-08T11:25:52.000-07:00
Summary: Pull Request resolved: #4160 As title, build executorch library ``` cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_COREML=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . cmake --build cmake-out -j16 --target install --config DEBUG ``` build llama_main binary ``` cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DEXECUTORCH_BUILD_COREML=ON \ -Bcmake-out/examples/models/llama2 \ examples/models/llama2 cmake --build cmake-out/examples/models/llama2 -j9 --config DEBUG ``` Run `llama_main` binary on mac ``` (executorch) chenlai@chenlai-mbp executorch % ./cmake-out/examples/models/llama2/llama_main --model_path coreml_llama2.pte --tokenizer_path /Users/chenlai/Documents/stories110M/tokenizer.bin I 00:00:00.028804 executorch:cpuinfo_utils.cpp:62] Reading file /sys/devices/soc0/image_version I 00:00:00.028844 executorch:cpuinfo_utils.cpp:78] Failed to open midr file /sys/devices/soc0/image_version I 00:00:00.028847 executorch:cpuinfo_utils.cpp:158] Number of efficient cores 4 I 00:00:00.028849 executorch:main.cpp:65] Resetting threadpool with num threads = 6 I 00:00:00.033847 executorch:runner.cpp:53] Creating LLaMa runner: model_path=coreml_llama2.pte, tokenizer_path=/Users/chenlai/Documents/stories110M/tokenizer.bin I 00:00:00.117320 executorch:runner.cpp:77] Reading metadata from model I 00:00:00.117374 executorch:runner.cpp:130] get_n_bos: 1 I 00:00:00.117386 executorch:runner.cpp:130] get_n_eos: 1 I 00:00:00.117392 executorch:runner.cpp:130] get_max_seq_len: 128 I 
00:00:00.117399 executorch:runner.cpp:130] use_kv_cache: 1 I 00:00:00.117407 executorch:runner.cpp:130] use_sdpa_with_kv_cache: 0 I 00:00:00.117412 executorch:runner.cpp:130] append_eos_to_prompt: 0 I 00:00:00.117413 executorch:runner.cpp:128] The model does not contain enable_dynamic_shape method, using default value 0 I 00:00:00.117415 executorch:runner.cpp:130] enable_dynamic_shape: 0 I 00:00:00.128599 executorch:runner.cpp:130] get_vocab_size: 512 I 00:00:00.128615 executorch:runner.cpp:130] get_bos_id: 1 I 00:00:00.128621 executorch:runner.cpp:130] get_eos_id: 2 PyTorchObserver {"prompt_tokens":9,"generated_tokens":118,"model_load_start_ms":1720221254434,"model_load_end_ms":1720221254528,"inference_start_ms":1720221254528,"inference_end_ms":1720221254817,"prompt_eval_end_ms":1720221254563,"first_token_ms":1720221254563,"aggregate_sampling_time_ms":9,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:00.417257 executorch:runner.cpp:509] Prompt Tokens: 9 Generated Tokens: 118 I 00:00:00.417260 executorch:runner.cpp:515] Model Load Time: 0.094000 (seconds) I 00:00:00.417266 executorch:runner.cpp:525] Total inference time: 0.289000 (seconds) Rate: 408.304498 (tokens/second) I 00:00:00.417268 executorch:runner.cpp:533] Prompt evaluation: 0.035000 (seconds) Rate: 257.142857 (tokens/second) I 00:00:00.417270 executorch:runner.cpp:544] Generated 118 tokens: 0.254000 (seconds) Rate: 464.566929 (tokens/second) I 00:00:00.417272 executorch:runner.cpp:552] Time to first generated token: 0.035000 (seconds) I 00:00:00.417274 executorch:runner.cpp:559] Sampling time over 127 tokens: 0.009000 (seconds) ``` ghstack-source-id: 232863706 Reviewed By: kirklandsign Differential Revision: D59412309 fbshipit-source-id: a31ac59e616d7333323a5c7961b8b7dafe2d45e5
diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt
@@ -187,3 +187,10 @@ set(TARGET coremldelegate APPEND_STRING PROPERTY COMPILE_FLAGS
 set(TARGET coremldelegate APPEND_STRING PROPERTY COMPILE_FLAGS
            "-Wno-receiver-expr"
 )
+
+# Install coremldelegate under <prefix>/lib. INCLUDES DESTINATION names the
+# include directories attached to the target if it is exported.
+install(
+  TARGETS coremldelegate DESTINATION lib
+  INCLUDES DESTINATION ${_common_include_directories}
+)
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
@@ -39,6 +39,7 @@ set(lib_list
     bundled_program
     extension_data_loader
     ${FLATCCRT_LIB}
+    coremldelegate
     mpsdelegate
     qnn_executorch_backend
     portable_ops_lib
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
@@ -189,6 +189,20 @@ if(TARGET mpsdelegate)
   target_link_options_shared_lib(mpsdelegate)
 endif()
 
+# When the Core ML delegate target exists, append it, sqlite3, and the Apple
+# frameworks below to link_libraries.
+if(TARGET coremldelegate)
+  # Use the path find_library() resolved; keep the bare name if it found none.
+  find_library(SQLITE_LIBRARY sqlite3)
+  if(NOT SQLITE_LIBRARY)
+    set(SQLITE_LIBRARY sqlite3)
+  endif()
+  list(APPEND link_libraries coremldelegate "${SQLITE_LIBRARY}"
+       "-framework Foundation" "-framework CoreML" "-framework Accelerate"
+  )
+  target_link_options_shared_lib(coremldelegate)
+endif()
+
 # This one is needed for cpuinfo where it uses android specific log lib
 if(ANDROID)
   list(APPEND link_libraries log)