tts : implement sesame CSM + Mimi decoder #12648


Open · wants to merge 38 commits into base: master

Commits (38)
- `24a07ab` tts : implement mimi decoder (ngxson, Mar 29, 2025)
- `efeaa57` fix llama-tts (ngxson, Mar 29, 2025)
- `a98f199` put mimi_model into a shared header (ngxson, Mar 29, 2025)
- `891273c` mimi : non-transposed input codes (ngxson, Mar 29, 2025)
- `6dca237` tts : add sesame csm (ngxson, Mar 29, 2025)
- `2d743b6` wip (ngxson, Mar 29, 2025)
- `f9162e7` wip (ngxson, Mar 30, 2025)
- `eae5f0e` add mimi_model::transpose_input (ngxson, Mar 30, 2025)
- `43bf237` fix build (ngxson, Mar 30, 2025)
- `e618405` fix build (2) (ngxson, Mar 30, 2025)
- `e185e0a` fix build (3) (ngxson, Mar 30, 2025)
- `ce83041` fix strcmp (ngxson, Mar 30, 2025)
- `61d8ad6` fix compilation on linux (ngxson, Mar 30, 2025)
- `4012054` clean up (ngxson, Mar 30, 2025)
- `b97fd3e` Merge branch 'xsn/mimi_dec' into xsn/csm_tts (ngxson, Mar 30, 2025)
- `7ecce76` working now (ngxson, Mar 30, 2025)
- `6976682` update readme (ngxson, Mar 30, 2025)
- `1e9afd9` nits (ngxson, Mar 30, 2025)
- `9f05741` Merge branch 'master' into xsn/csm_tts (ngxson, Mar 30, 2025)
- `40ab1ab` fix mul_mat_id read out-of-bound (ngxson, Mar 30, 2025)
- `eaba2bf` will this fix windows build? (ngxson, Mar 30, 2025)
- `5fe27ef` (try) fixing problem with long text (ngxson, Mar 30, 2025)
- `c796ee0` mimi: fix frame splitting (ngxson, Mar 30, 2025)
- `e31a75c` fix mimi example dummy1 (ngxson, Mar 31, 2025)
- `5be8e7d` add top-k and temp sampling (ngxson, Mar 31, 2025)
- `90231cc` much better on long generation (ngxson, Apr 1, 2025)
- `156b528` Merge branch 'master' into xsn/csm_tts (ngxson, Apr 2, 2025)
- `e9dc476` fix tts-csm (ngxson, Apr 2, 2025)
- `c681257` ability to do multi-turns (ngxson, Apr 2, 2025)
- `142b545` Merge branch 'master' into xsn/csm_tts (ngxson, Apr 3, 2025)
- `d178099` add audio EOS token (ngxson, Apr 3, 2025)
- `0b55d8b` Merge branch 'master' into xsn/csm_tts (ngxson, Apr 5, 2025)
- `1219827` Merge branch 'master' into xsn/csm_tts (ngxson, Apr 9, 2025)
- `d1de6cc` add speaker reference (ngxson, Apr 9, 2025)
- `31b5d22` Merge branch 'master' into xsn/csm_tts (ngxson, Apr 23, 2025)
- `9533fb7` fix build_attn (ngxson, Apr 23, 2025)
- `e5bb560` rm print (ngxson, Apr 23, 2025)
- `c1cd710` fix pyright (ngxson, Apr 23, 2025)
1 change: 1 addition & 0 deletions .gitignore
@@ -107,6 +107,7 @@ examples/server/*.gz.hpp
!examples/*/*/*.kts
!examples/sycl/*.bat
!examples/sycl/*.sh
/*.wav

# Server Web UI temporary files
node_modules
28 changes: 28 additions & 0 deletions common/common.cpp
@@ -1565,3 +1565,31 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

return result;
}

//
// Audio utils
//

bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
std::ofstream file(fname, std::ios::binary);
if (!file) {
LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
return false;
}

wav_header header;
header.sample_rate = sample_rate;
header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
header.block_align = header.num_channels * (header.bits_per_sample / 8);
header.data_size = data.size() * (header.bits_per_sample / 8);
header.chunk_size = 36 + header.data_size;

file.write(reinterpret_cast<const char*>(&header), sizeof(header));

for (const auto & sample : data) {
int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
}

return file.good();
}
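The per-sample conversion above (scale by 32767, clamp, truncate to `int16_t`) can be mirrored in Python for illustration; this is only a sketch of the same arithmetic, not part of the actual implementation:

```python
import struct

def float_to_pcm16(samples):
    """Mirror of the save_wav16 loop: scale float samples in [-1, 1]
    to little-endian 16-bit PCM, clamping out-of-range values."""
    out = bytearray()
    for s in samples:
        # same as std::clamp(s * 32767.0, -32768.0, 32767.0), then truncate
        v = int(max(-32768.0, min(32767.0, s * 32767.0)))
        out += struct.pack('<h', v)
    return bytes(out)
```

Note that values slightly outside [-1, 1] are clamped rather than wrapped, which avoids the harsh wrap-around distortion a plain cast would produce.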
22 changes: 22 additions & 0 deletions common/common.h
@@ -662,3 +662,25 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// Audio utils
//

struct wav_header {
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t chunk_size;
char wave[4] = {'W', 'A', 'V', 'E'};
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_chunk_size = 16;
uint16_t audio_format = 1; // PCM
uint16_t num_channels = 1; // Mono
uint32_t sample_rate;
uint32_t byte_rate;
uint16_t block_align;
uint16_t bits_per_sample = 16;
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size;
};

bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate);
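For reference, the 44-byte layout of `wav_header` can be reproduced with Python's `struct` module. The field order and sizes follow the C++ struct above; writing the struct directly from memory works here because every member sits on its natural alignment, so no padding is inserted:

```python
import struct

def pack_wav_header(sample_rate, num_samples, num_channels=1, bits_per_sample=16):
    """Pack a 44-byte RIFF/WAVE header matching the wav_header struct."""
    bytes_per_sample = bits_per_sample // 8
    data_size   = num_samples * num_channels * bytes_per_sample
    byte_rate   = sample_rate * num_channels * bytes_per_sample
    block_align = num_channels * bytes_per_sample
    return struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + data_size, b'WAVE',
        b'fmt ', 16, 1, num_channels,   # fmt chunk: size 16, audio_format 1 (PCM)
        sample_rate, byte_rate, block_align, bits_per_sample,
        b'data', data_size,
    )
```

This can be handy for sanity-checking an `output.wav` produced by the example: the first 44 bytes of the file should match a header packed with the same parameters.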
17 changes: 17 additions & 0 deletions examples/tts/CMakeLists.txt
@@ -3,3 +3,20 @@ add_executable(${TARGET} tts.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

add_library(mimi-model STATIC mimi-model.h mimi-model.cpp)
target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
target_compile_features(mimi-model PRIVATE cxx_std_20)

set(TARGET llama-mimi)
add_executable(${TARGET} mimi.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-tts-csm)
add_executable(${TARGET} tts-csm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
47 changes: 47 additions & 0 deletions examples/tts/README-csm.md
@@ -0,0 +1,47 @@
# Sesame CSM

This demo shows how to run [Sesame CSM](https://github.com/SesameAILabs/csm) inference using llama.cpp / GGML.

It consists of 3 components, each with its own GGUF file:
1. Backbone LLM
2. Decoder LLM
3. Mimi decoder
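The three components chain text into semantic codes, then acoustic codes, then a waveform. The structural sketch below uses dummy stand-ins only; none of these function names come from llama.cpp, and the frame count, the 2048-entry codebook, and the 1920-samples-per-frame figure (80 ms at 24 kHz) are assumptions about a Mimi-style RVQ setup, not values taken from this PR:

```python
# Structural sketch of the CSM pipeline with dummy stand-in stages.

def backbone_generate(text):
    # stage 1: backbone LLM maps text to one semantic code per frame
    return [(len(text) * 37 + i) % 2048 for i in range(4)]

def decoder_generate(semantic_codes):
    # stage 2: decoder LLM expands each frame into acoustic codebook entries
    return [[(c + k) % 2048 for k in range(8)] for c in semantic_codes]

def mimi_decode(frames):
    # stage 3: Mimi decoder turns code frames into PCM float samples
    return [0.0 for _ in frames for _ in range(1920)]

def tts(text):
    semantic = backbone_generate(text)
    acoustic = decoder_generate(semantic)
    return mimi_decode(acoustic)
```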

## Quick start

By default, all GGUF files are downloaded from the [ggml-org Hugging Face account](https://huggingface.co/ggml-org/sesame-csm-1b-GGUF).

```sh
# build (make sure to have LLAMA_CURL enabled)
cmake -B build -DLLAMA_CURL=ON
cmake --build build -j --target llama-tts-csm

# run it
./build/bin/llama-tts-csm -p "[0]Hi, my name is Xuan Son. I am a software engineer at Hugging Face."
```

## Convert the model yourself

To get the GGUF:

```sh
python examples/tts/convert_csm_to_gguf.py

# default output files:
# sesame-csm-backbone.gguf
# sesame-csm-decoder.gguf

# optionally, quantize it
# (q8_0 is the lowest scheme; quantizing further is not worth it, quality degrades too much)
python examples/tts/convert_csm_to_gguf.py --outtype q8_0
```

Run the example using local files:

```sh
./build/bin/llama-tts-csm -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -p "[0]Hello world."
# sesame-csm-decoder.gguf will be loaded automatically
# make sure to place both backbone and decoder GGUF files in the same directory

# output file: output.wav
```
50 changes: 50 additions & 0 deletions examples/tts/README-mimi.md
@@ -0,0 +1,50 @@
# llama.cpp/examples/mimi

This demonstrates running [Kyutai's Mimi](https://huggingface.co/kyutai/mimi) model via GGML.

## Quickstart

Convert the model to GGUF (no need to download anything manually; the script fetches the `safetensors` file automatically):

```sh
python examples/tts/convert_mimi_to_gguf.py

# output file: kyutai-mimi.gguf

# optionally, use q8_0 quantization for faster speed
python examples/tts/convert_mimi_to_gguf.py --outtype q8_0
```

Then compile and run it:

```sh
cmake --build build -j --target llama-mimi

./build/bin/llama-mimi kyutai-mimi.gguf codes.txt

# output: output.wav

# alternatively, use "dummy1" to get a "wah hello there" sample output file
./build/bin/llama-mimi kyutai-mimi.gguf dummy1
```

Example of a codes file (one code per line):

```
1263
1597
1596
1477
1540
1720
1433
118
1066
1968
1096
232
418
566
1653
2010
```
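A codes file in this format is trivial to generate or parse. A minimal sketch, assuming Mimi's codebook size of 2048 (so valid codes are 0..2047):

```python
def write_codes(path, codes):
    # one integer code per line, as read by llama-mimi
    with open(path, 'w') as f:
        f.write('\n'.join(str(c) for c in codes) + '\n')

def read_codes(path):
    with open(path) as f:
        codes = [int(line) for line in f if line.strip()]
    assert all(0 <= c < 2048 for c in codes), "code out of Mimi codebook range"
    return codes
```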