
Commit 9393598

whisper : Metal and ggml-alloc support (#1270)
* metal : init
* whisper : factor out graph builds
* whisper : allocate encoder and decoder using ggml-alloc
* whisper : ggml-alloc is now supported
* whisper : CoreML support ggml-alloc
* build : fix ggml-alloc
* ios : update submodule
* extra : update sync-ggml.sh script to also sync ggml-alloc
* ci : see if this is causing the crash
* whisper : refactor ggml-alloc init
* whisper.android : try to fix build
* whisper : initial Metal version
* ci : try to debug vmem issue
* metal : decoder works on GPU!
* metal : add multi-decoder support
* ggml : fix ggml_nbytes (probably temp solution)
* metal : run "cross" step on the GPU
* whisper : remove ggml_repeat in the encoder
* whisper : offload the Encoder to Metal
* ggml : use simpler ggml_bytes() implementation
* ggml-alloc : try to make CI happy by reducing vram to 128GB
* whisper : add whisper_allocr to wrap ggml_allocr
* whisper : factor out alloc init in a function
* cmake : update to support Metal build
* whisper : add <functional> header
* objc : fix build (no Metal yet)
* ios : add Metal support
* swiftui : fix build
* metal : speed-up KQ multiplication
* metal : sync latest llama.cpp kernels
* readme : add Metal info
* ios : update submodule
* coreml : add code to toggle Core ML config (CPU, ANE, GPU)
* bench : fix timings by running a pre-heat
* bench : start benching the decoder
* whisper : add ggml_mul_mat_pad
* bench : fix uninitialized vars
* whisper : add comment for disabling mul-mat padding
* whisper : add description of ggml_mul_mat_pad
* whisper : clean-up ggml_mul_mat_pad
* metal : remove the "concurrent" flag
* bench : variable n_past
* ios : update SPM package
1 parent 3fec211 commit 9393598
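The ggml-alloc half of this change replaces fixed scratch buffers with a measure-then-allocate scheme: each graph is built once under a "measure" allocator that only records the worst-case memory the intermediate tensors need, and is then rebuilt on every call against a tight buffer of exactly that size. Below is a minimal sketch of the pattern, assuming the ggml-alloc API of this period (ggml_allocr_new_measure, ggml_allocr_alloc_graph, ggml_allocr_new, ggml_allocr_free); the graph_allocr names are illustrative stand-ins for the commit's whisper_allocr wrapper, not its code verbatim.

#include "ggml.h"
#include "ggml-alloc.h"

#include <cstdint>
#include <functional>
#include <vector>

// illustrative analogue of the commit's whisper_allocr wrapper
struct graph_allocr {
    struct ggml_allocr * alloc = nullptr;
    std::vector<uint8_t> data; // backing buffer for the graph's tensors
};

static void graph_allocr_init(graph_allocr & ar, std::function<ggml_cgraph *()> build_graph) {
    const size_t tensor_alignment = 32;

    // pass 1 - measure: no real memory is touched, the allocator only
    // records the high-water mark of the graph's intermediate tensors
    ar.alloc = ggml_allocr_new_measure(tensor_alignment);
    const size_t size = ggml_allocr_alloc_graph(ar.alloc, build_graph()) + tensor_alignment;
    ggml_allocr_free(ar.alloc);

    // pass 2 - allocate: back the allocator with a buffer of exactly that size
    ar.data.resize(size);
    ar.alloc = ggml_allocr_new(ar.data.data(), ar.data.size(), tensor_alignment);
}

Before each subsequent evaluation the allocator is reset (ggml_allocr_reset) and the graph is rebuilt, so tensor memory is reused rather than re-allocated at runtime.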

File tree

18 files changed: +1537 −934 lines changed

CMakeLists.txt

Lines changed: 57 additions & 9 deletions
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 3.0)
+cmake_minimum_required (VERSION 3.5)
 
 project(whisper.cpp VERSION 1.4.2)
 
@@ -35,6 +35,12 @@ endif()
 
 # options
 
+if (APPLE)
+    set(WHISPER_METAL_DEFAULT ON)
+else()
+    set(WHISPER_METAL_DEFAULT OFF)
+endif()
+
 option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
 
 option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
@@ -58,6 +64,8 @@ option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
 
 if (APPLE)
     option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
+    option(WHISPER_METAL "whisper: use Metal" ${WHISPER_METAL_DEFAULT})
+    option(WHISPER_METAL_NDEBUG "whisper: disable Metal debugging" OFF)
     option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
     option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 else()
@@ -113,6 +121,34 @@ if (APPLE)
     endif()
 endif()
 
+if (WHISPER_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+
+    if (METAL_FRAMEWORK)
+        message(STATUS "Metal framework found")
+
+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
+            ${FOUNDATION_LIBRARY}
+            ${METAL_FRAMEWORK}
+            ${METALKIT_FRAMEWORK}
+            )
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)
+
+        if (WHISPER_METAL_NDEBUG)
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
+        endif()
+    else()
+        message(WARNING "Metal framework not found")
+    endif()
+
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+endif()
+
 if (WHISPER_COREML)
     find_library(FOUNDATION_FRAMEWORK Foundation)
     find_library(COREML_FRAMEWORK CoreML)
@@ -177,7 +213,7 @@ if (WHISPER_CUBLAS)
 
     enable_language(CUDA)
 
-    set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
 
@@ -228,7 +264,7 @@ if (WHISPER_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
 
-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
 
         add_compile_definitions(GGML_USE_CLBLAST)
 
@@ -426,8 +462,11 @@ set(TARGET whisper)
 add_library(${TARGET}
     ggml.h
     ggml.c
-    ${GGML_CUDA_SOURCES}
-    ${GGML_OPENCL_SOURCES}
+    ggml-alloc.h
+    ggml-alloc.c
+    ${GGML_SOURCES_METAL}
+    ${GGML_SOURCES_CUDA}
+    ${GGML_SOURCES_OPENCL}
     whisper.h
     whisper.cpp
     )
@@ -468,9 +507,15 @@ if (BUILD_SHARED_LIBS)
         WHISPER_BUILD
         GGML_BUILD
     )
+
+    if (WHISPER_METAL)
+        # TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
+        # but for some reason it does not work here like it does in llama.cpp
+        set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    endif()
 endif()
 
-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
     set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
     set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
@@ -486,10 +531,13 @@ target_compile_definitions(${TARGET} PUBLIC
 
 set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
 
+include(GNUInstallDirs)
+
 install(TARGETS ${TARGET}
-    LIBRARY DESTINATION lib
-    ARCHIVE DESTINATION lib/static
-    RUNTIME DESTINATION bin
+    LIBRARY       DESTINATION lib
+    ARCHIVE       DESTINATION lib/static
+    RUNTIME       DESTINATION bin
+    RESOURCE      DESTINATION bin
     PUBLIC_HEADER DESTINATION include
     )
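The CMake changes above only wire Metal into the build; at runtime the -DGGML_USE_METAL define routes graph evaluation through ggml-metal. The following is a hedged sketch of how that path is typically consumed on the C++ side, based on the ggml-metal.h API of this period (ggml_metal_init, ggml_metal_add_buffer, ggml_metal_graph_compute, ggml_metal_free) — a simplified illustration, not the commit's whisper.cpp code verbatim.

#ifdef GGML_USE_METAL
#include "ggml.h"
#include "ggml-metal.h"

// Evaluate a ggml graph on the GPU. `buf`/`buf_size` is the host buffer that
// holds the graph's tensor data; Metal maps it rather than copying it.
static bool compute_on_metal(struct ggml_cgraph * gf, void * buf, size_t buf_size) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1); // 1 command buffer
    if (ctx_metal == nullptr) {
        return false; // e.g. no Metal device available
    }

    // register the buffer so kernels can address the tensors it contains
    if (!ggml_metal_add_buffer(ctx_metal, "data", buf, buf_size, 0)) {
        ggml_metal_free(ctx_metal);
        return false;
    }

    ggml_metal_graph_compute(ctx_metal, gf); // runs the whole graph on the GPU

    ggml_metal_free(ctx_metal);
    return true;
}
#endif

In the commit itself the Metal context is created once and the model, KV-cache and compute buffers are registered at init time, so per-token evaluation does no mapping work; the per-call teardown here is only for illustration.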

Makefile

Lines changed: 22 additions & 1 deletion
@@ -18,7 +18,7 @@ ifndef NVCC_VERSION
 	endif
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
+CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
 
 # Mac OS + Arm can report x86_64
@@ -182,6 +182,15 @@ ifdef WHISPER_COREML_ALLOW_FALLBACK
 endif
 endif
 
+ifndef WHISPER_NO_METAL
+ifeq ($(UNAME_S),Darwin)
+	WHISPER_METAL := 1
+
+	CXXFLAGS += -DGGML_USE_METAL
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
+endif
+endif
+
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	LDFLAGS += -lopenblas
@@ -288,6 +297,11 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
+ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
+WHISPER_OBJ += ggml-alloc.o
+
 whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -303,6 +317,13 @@ whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-imp
 WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif
 
+ifdef WHISPER_METAL
+ggml-metal.o: ggml-metal.m ggml-metal.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
+WHISPER_OBJ += ggml-metal.o
+endif
+
 libwhisper.a: ggml.o $(WHISPER_OBJ)
 	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
 

README.md

Lines changed: 6 additions & 2 deletions
@@ -11,14 +11,14 @@ Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / S
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
 
 - Plain C/C++ implementation without dependencies
-- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate framework and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
+- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
 - [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
-- Runs on the CPU
+- Support for CPU-only inference
 - [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
 - [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
@@ -50,6 +50,10 @@ You can also easily make your own offline voice assistant application: [command]
 
 https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
 
+On Apple Silicon, the inference runs fully on the GPU via Metal:
+
+https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
+
 Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 
 ## Implementation details

coreml/whisper-encoder.mm

Lines changed: 7 additions & 1 deletion
@@ -22,7 +22,13 @@
 
     NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
 
-    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
+    // select which device to run the Core ML model on
+    MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
+    config.computeUnits = MLComputeUnitsCPUAndGPU;
+    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
+    //config.computeUnits = MLComputeUnitsAll;
+
+    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);
 
     if (data == NULL) {
         return NULL;

examples/bench/bench.cpp

Lines changed: 40 additions & 3 deletions
@@ -44,13 +44,13 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
     fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
     fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
-    fprintf(stderr, " %-7s 0 - whisper encoder\n", "");
+    fprintf(stderr, " %-7s 0 - whisper\n", "");
     fprintf(stderr, " %-7s 1 - memcpy\n", "");
     fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
     fprintf(stderr, "\n");
 }
 
-int whisper_bench_encoder(const whisper_params & params) {
+int whisper_bench_full(const whisper_params & params) {
     // whisper init
 
     struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -69,12 +69,49 @@ int whisper_bench_encoder(const whisper_params & params) {
         fprintf(stderr, "error: failed to set mel: %d\n", ret);
         return 3;
     }
+    // heat encoder
+    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        return 4;
+    }
+
+    whisper_token tokens[512];
+    memset(tokens, 0, sizeof(tokens));
+
+    // prompt heat
+    if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
+        return 4;
+    }
+
+    // text-generation heat
+    if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
+        return 4;
+    }
 
+    whisper_reset_timings(ctx);
+
+    // actual run
     if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
         fprintf(stderr, "error: failed to encode model: %d\n", ret);
         return 4;
     }
 
+    for (int i = 0; i < 16; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
+        }
+    }
+
+    for (int i = 0; i < 256; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
        }
+    }
+
     whisper_print_timings(ctx);
     whisper_free(ctx);
 
@@ -103,7 +140,7 @@ int main(int argc, char ** argv) {
     int ret = -1;
 
     switch (params.what) {
-        case 0: ret = whisper_bench_encoder(params); break;
+        case 0: ret = whisper_bench_full(params); break;
        case 1: ret = whisper_bench_memcpy(params.n_threads); break;
        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
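One detail worth flagging in the bench code above: `if (int ret = whisper_decode(...) != 0)` binds as `int ret = (whisper_decode(...) != 0)`, so `ret` holds 0 or 1 rather than the real error code by the time it is printed. Below is a hedged restatement of the warm-up-then-measure flow with the assignment separated; it uses the same public whisper.h API, but `bench_decoder` is an illustrative helper name, not part of the commit.

#include "whisper.h"

#include <cstdio>
#include <cstring>
#include <initializer_list>

// Decoder benchmark with a warm-up pass, so one-time costs
// (e.g. GPU pipeline setup) are excluded from the timings.
static int bench_decoder(struct whisper_context * ctx, int n_threads) {
    whisper_token tokens[512];
    memset(tokens, 0, sizeof(tokens));

    // warm-up: one batch (prompt-style) decode and one single-token decode
    for (int n : { 256, 1 }) {
        const int ret = whisper_decode(ctx, tokens, n, 0, n_threads);
        if (ret != 0) {
            fprintf(stderr, "error: failed to decode: %d\n", ret);
            return ret;
        }
    }

    whisper_reset_timings(ctx);

    // timed: 16 batch decodes, then 256 single-token decodes at growing n_past
    for (int i = 0; i < 16; i++) {
        const int ret = whisper_decode(ctx, tokens, 256, 0, n_threads);
        if (ret != 0) return ret;
    }
    for (int i = 0; i < 256; i++) {
        const int ret = whisper_decode(ctx, tokens, 1, i, n_threads);
        if (ret != 0) return ret;
    }

    whisper_print_timings(ctx);
    return 0;
}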

examples/talk-llama/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ if (WHISPER_SDL2)
 
     # TODO: this is temporary
     # need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../ggml-alloc.c ../../whisper.cpp)
 
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
     target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

examples/whisper.android/app/src/main/jni/whisper/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
 set(
     SOURCE_FILES
     ${WHISPER_LIB_DIR}/ggml.c
+    ${WHISPER_LIB_DIR}/ggml-alloc.c
     ${WHISPER_LIB_DIR}/whisper.cpp
     ${CMAKE_SOURCE_DIR}/jni.c
     )
@@ -20,7 +21,7 @@ function(build_library target_name)
         SHARED
         ${SOURCE_FILES}
     )
-
+
     target_link_libraries(${target_name} ${LOG_LIB} android)
 
     if (${target_name} STREQUAL "whisper_v8fp16_va")

examples/whisper.objc/README.md

Lines changed: 12 additions & 0 deletions
@@ -28,10 +28,22 @@ This can significantly improve the performance of the transcription:
 
 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
 
+## Core ML
+
 If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flag for `whisper.cpp` in Build Phases:
 
 <img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
 
 Then follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the model.
 
 This project also adds `-O3 -DNDEBUG` to `Other C Flags`; adding flags at the app-project level is not ideal in the real world (they apply to all C/C++ files), so consider splitting the xcodeproj into a workspace in your own project.
+
+## Metal
+
+You can also enable Metal to make the inference run on the GPU of your device. This might or might not be more efficient
+compared to Core ML depending on the model and device that you use.
+
+To enable Metal, just add `-DGGML_USE_METAL` instead of the `-DWHISPER_USE_COREML` flag and you are ready.
+This will make both the Encoder and the Decoder run on the GPU.
+
+If you want to run the Encoder with Core ML and the Decoder with Metal then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
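Since both switches are plain preprocessor defines, one quick way to confirm which path a given build will take is a compile-time check. A hedged illustration (not part of the app code; the messages are ours):

// Which backend combination is this translation unit built with?
#if defined(WHISPER_USE_COREML) && defined(GGML_USE_METAL)
#pragma message("whisper.cpp: Encoder via Core ML, Decoder via Metal")
#elif defined(GGML_USE_METAL)
#pragma message("whisper.cpp: Encoder and Decoder via Metal")
#elif defined(WHISPER_USE_COREML)
#pragma message("whisper.cpp: Encoder via Core ML, Decoder on CPU")
#else
#pragma message("whisper.cpp: CPU-only build")
#endif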
