Skip to content

Commit 27aa259

Browse files
authored
mtmd : add C public API (#13184)
* init
* wip
* working version
* add mtmd::bitmaps
* add test target
* rm redundant define
* test: mtmd_input_chunks_free
* rm outdated comment
* fix merging issue
* explicitly create mtmd::input_chunks
* mtmd_input_chunk_copy
* add clone()
* add const to various places
* add warning about breaking changes
* helper: use mtmd_image_tokens_get_n_pos
1 parent 9fdfcda commit 27aa259

File tree

7 files changed

+714
-242
lines changed

7 files changed

+714
-242
lines changed

tests/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ if (NOT GGML_BACKEND_DL)
165165
llama_build_and_test(test-rope.cpp)
166166
endif()
167167

168+
# libmtmd
169+
set(LLAMA_TEST_NAME test-mtmd-c-api)
170+
llama_build_and_test(test-mtmd-c-api.c)
171+
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
168172

169173
# dummy executable - not installed
170174
get_filename_component(TEST_TARGET test-c.c NAME_WE)

tests/test-mtmd-c-api.c

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#include <stdio.h>
#include <assert.h>

#include "mtmd.h"

// Smoke test for the libmtmd public C API.
//
// Verifies that the header is consumable from plain C, that the default
// context params are populated, and that the chunk accessors
// (mtmd_input_chunks_size / _get / _get_type / token getters) behave sanely
// on the chunks produced by mtmd_test_create_input_chunks().
//
// Returns 0 on success, 1 on any failure.
int main(void) {
    printf("\n\nTesting libmtmd C API...\n");
    printf("--------\n\n");

    // Defaults must come back populated; image_marker in particular must be
    // a valid string, since we print it here.
    struct mtmd_context_params params = mtmd_context_params_default();
    printf("Default image marker: %s\n", params.image_marker);

    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();

    if (!chunks) {
        fprintf(stderr, "Failed to create input chunks\n");
        return 1;
    }

    size_t n_chunks = mtmd_input_chunks_size(chunks);
    printf("Number of chunks: %zu\n", n_chunks);
    assert(n_chunks > 0);

    for (size_t i = 0; i < n_chunks; i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        assert(chunk != NULL);
        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
        printf("Chunk %zu type: %d\n", i, type);

        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_tokens;
            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            printf("  Text chunk with %zu tokens\n", n_tokens);
            assert(tokens != NULL);
            assert(n_tokens > 0);
            for (size_t j = 0; j < n_tokens; j++) {
                // llama_token is a signed integer; negative values would
                // indicate a corrupted chunk.
                assert(tokens[j] >= 0);
                printf("    > Token %zu: %d\n", j, tokens[j]);
            }

        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
            size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
            size_t nx = mtmd_image_tokens_get_nx(image_tokens);
            size_t ny = mtmd_image_tokens_get_ny(image_tokens);
            const char * id = mtmd_image_tokens_get_id(image_tokens);
            assert(n_tokens > 0);
            assert(nx > 0);
            assert(ny > 0);
            assert(id != NULL);
            printf("  Image chunk with %zu tokens\n", n_tokens);
            printf("  Image size: %zu x %zu\n", nx, ny);
            printf("  Image ID: %s\n", id);

        } else {
            // Fail loudly on chunk types this test does not know about,
            // instead of silently skipping them (which would let the test
            // pass vacuously). Free the chunks before bailing out so the
            // error path does not leak.
            fprintf(stderr, "Unexpected chunk type: %d\n", (int) type);
            mtmd_input_chunks_free(chunks);
            return 1;
        }
    }

    // Free the chunks
    mtmd_input_chunks_free(chunks);

    printf("\n\nDONE: test libmtmd C API...\n");

    return 0;
}

tools/llava/clip-impl.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,15 @@ struct clip_image_u8_batch {
233233

234234
struct clip_image_f32_batch {
235235
std::vector<clip_image_f32_ptr> entries;
236+
237+
clip_image_f32_batch clone() const {
238+
clip_image_f32_batch new_batch;
239+
new_batch.entries.reserve(entries.size());
240+
for (const auto & entry : entries) {
241+
new_batch.entries.emplace_back(new clip_image_f32(*entry));
242+
}
243+
return new_batch;
244+
}
236245
};
237246

238247
//

tools/llava/clip.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
7878
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
7979
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
8080

81-
CLIP_API struct clip_image_size * clip_image_size_init();
82-
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
83-
CLIP_API struct clip_image_f32 * clip_image_f32_init();
84-
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
81+
CLIP_API struct clip_image_size * clip_image_size_init(void);
82+
CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
83+
CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
84+
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
8585

8686
// nx, ny are the output image dimensions
8787
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);

tools/llava/mtmd-cli.cpp

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
6363
#endif
6464

6565
struct mtmd_cli_context {
66-
mtmd_context_ptr ctx_vision;
66+
mtmd::context_ptr ctx_vision;
6767
common_init_result llama_init;
6868

6969
llama_model * model;
@@ -72,7 +72,7 @@ struct mtmd_cli_context {
7272
llama_batch batch;
7373
int n_batch;
7474

75-
std::vector<mtmd_bitmap> bitmaps;
75+
mtmd::bitmaps bitmaps;
7676

7777
// note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
7878
// so here we don't need to keep track of chat history
@@ -115,12 +115,12 @@ struct mtmd_cli_context {
115115

116116
void init_vision_context(common_params & params) {
117117
const char * clip_path = params.mmproj.path.c_str();
118-
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
119-
/* use_gpu */ params.mmproj_use_gpu,
120-
/* timings */ true,
121-
/* n_threads */ params.cpuparams.n_threads,
122-
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
123-
}));
118+
mtmd_context_params mparams = mtmd_context_params_default();
119+
mparams.use_gpu = params.mmproj_use_gpu;
120+
mparams.print_timings = true;
121+
mparams.n_threads = params.cpuparams.n_threads;
122+
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
123+
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
124124
if (!ctx_vision.get()) {
125125
LOG_ERR("Failed to load vision model from %s\n", clip_path);
126126
exit(1);
@@ -139,11 +139,11 @@ struct mtmd_cli_context {
139139
}
140140

141141
bool load_image(const std::string & fname) {
142-
mtmd_bitmap bitmap;
143-
if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
142+
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
143+
if (!bmp.ptr) {
144144
return false;
145145
}
146-
bitmaps.push_back(std::move(bitmap));
146+
bitmaps.entries.push_back(std::move(bmp));
147147
return true;
148148
}
149149
};
@@ -193,27 +193,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
193193
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
194194

195195
mtmd_input_text text;
196-
text.text = formatted_chat.prompt;
196+
text.text = formatted_chat.prompt.c_str();
197197
text.add_special = add_bos;
198198
text.parse_special = true;
199-
mtmd_input_chunks chunks;
200199

201200
if (g_is_interrupted) return 0;
202201

203-
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, ctx.bitmaps);
202+
mtmd::input_chunks chunks(mtmd_input_chunks_init());
203+
auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
204+
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
205+
chunks.ptr.get(), // output
206+
&text, // text
207+
bitmaps_c_ptr.data(),
208+
bitmaps_c_ptr.size());
204209
if (res != 0) {
205210
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
206211
return 1;
207212
}
208213

209-
ctx.bitmaps.clear();
210-
211-
if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
214+
ctx.bitmaps.entries.clear();
215+
216+
llama_pos new_n_past;
217+
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
218+
ctx.lctx, // lctx
219+
chunks.ptr.get(), // chunks
220+
ctx.n_past, // n_past
221+
0, // seq_id
222+
ctx.n_batch, // n_batch
223+
true, // logits_last
224+
&new_n_past)) {
212225
LOG_ERR("Unable to eval prompt\n");
213226
return 1;
214227
}
215228

216-
ctx.n_past += mtmd_helper_get_n_pos(chunks);
229+
ctx.n_past = new_n_past;
217230

218231
LOG("\n");
219232

@@ -246,7 +259,7 @@ int main(int argc, char ** argv) {
246259
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
247260
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
248261

249-
// ctrl+C handling
262+
// Ctrl+C handling
250263
{
251264
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
252265
struct sigaction sigint_action;

0 commit comments

Comments (0)