Skip to content

mtmd : add C public API #13184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
May 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-rope.cpp)
endif()

# libmtmd
# Register a smoke test that exercises the public C API of libmtmd from a
# plain C translation unit (catches accidental C++-isms in the header).
# NOTE(review): assumes llama_build_and_test() picks up LLAMA_TEST_NAME set
# just above as the target name — confirm against the helper's definition.
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
Expand Down
63 changes: 63 additions & 0 deletions tests/test-mtmd-c-api.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#include <stdio.h>
#include <assert.h>

#include "mtmd.h"

// Validate and print one text chunk: its token pointer must be non-NULL and
// every token id non-negative.
static void check_text_chunk(const mtmd_input_chunk * chunk) {
    size_t n_tokens;
    const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
    printf(" Text chunk with %zu tokens\n", n_tokens);
    assert(tokens != NULL);
    assert(n_tokens > 0);
    for (size_t t = 0; t < n_tokens; t++) {
        assert(tokens[t] >= 0);
        printf(" > Token %zu: %d\n", t, tokens[t]);
    }
}

// Validate and print one image chunk: token count, dimensions and id must all
// be populated by the library.
static void check_image_chunk(const mtmd_input_chunk * chunk) {
    const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk);
    const size_t n_tokens = mtmd_image_tokens_get_n_tokens(img);
    const size_t nx       = mtmd_image_tokens_get_nx(img);
    const size_t ny       = mtmd_image_tokens_get_ny(img);
    const char * id       = mtmd_image_tokens_get_id(img);
    assert(n_tokens > 0);
    assert(nx > 0);
    assert(ny > 0);
    assert(id != NULL);
    printf(" Image chunk with %zu tokens\n", n_tokens);
    printf(" Image size: %zu x %zu\n", nx, ny);
    printf(" Image ID: %s\n", id);
}

// Smoke test for the libmtmd C API: builds a synthetic chunk list via the
// test hook and walks it through the public accessors.
int main(void) {
    printf("\n\nTesting libmtmd C API...\n");
    printf("--------\n\n");

    struct mtmd_context_params ctx_params = mtmd_context_params_default();
    printf("Default image marker: %s\n", ctx_params.image_marker);

    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();
    if (chunks == NULL) {
        fprintf(stderr, "Failed to create input chunks\n");
        return 1;
    }

    const size_t n_chunks = mtmd_input_chunks_size(chunks);
    printf("Number of chunks: %zu\n", n_chunks);
    assert(n_chunks > 0);

    for (size_t i = 0; i < n_chunks; i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        assert(chunk != NULL);
        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
        printf("Chunk %zu type: %d\n", i, type);

        switch (type) {
            case MTMD_INPUT_CHUNK_TYPE_TEXT:
                check_text_chunk(chunk);
                break;
            case MTMD_INPUT_CHUNK_TYPE_IMAGE:
                check_image_chunk(chunk);
                break;
            default:
                // other chunk types are not exercised by this test
                break;
        }
    }

    // Release the chunk list created by the test hook.
    mtmd_input_chunks_free(chunks);

    printf("\n\nDONE: test libmtmd C API...\n");

    return 0;
}
9 changes: 9 additions & 0 deletions tools/llava/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,15 @@ struct clip_image_u8_batch {

struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;

clip_image_f32_batch clone() const {
clip_image_f32_batch new_batch;
new_batch.entries.reserve(entries.size());
for (const auto & entry : entries) {
new_batch.entries.emplace_back(new clip_image_f32(*entry));
}
return new_batch;
}
};

//
Expand Down
8 changes: 4 additions & 4 deletions tools/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
// Allocators for the image helper structs. The explicit (void) parameter
// lists make these correct prototypes when the header is consumed from C,
// where an empty () would leave the parameter list unspecified.
// NOTE(review): presumably each is released via a matching clip_image_*_free()
// elsewhere in this header — confirm before relying on it.
CLIP_API struct clip_image_size * clip_image_size_init(void);
CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

// nx, ny are the output image dimensions
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
Expand Down
51 changes: 32 additions & 19 deletions tools/llava/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
#endif

struct mtmd_cli_context {
mtmd_context_ptr ctx_vision;
mtmd::context_ptr ctx_vision;
common_init_result llama_init;

llama_model * model;
Expand All @@ -72,7 +72,7 @@ struct mtmd_cli_context {
llama_batch batch;
int n_batch;

std::vector<mtmd_bitmap> bitmaps;
mtmd::bitmaps bitmaps;

// note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
// so here we don't need to keep track of chat history
Expand Down Expand Up @@ -115,12 +115,12 @@ struct mtmd_cli_context {

void init_vision_context(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
/* use_gpu */ params.mmproj_use_gpu,
/* timings */ true,
/* n_threads */ params.cpuparams.n_threads,
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
}));
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params.mmproj_use_gpu;
mparams.print_timings = true;
mparams.n_threads = params.cpuparams.n_threads;
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
exit(1);
Expand All @@ -139,11 +139,11 @@ struct mtmd_cli_context {
}

bool load_image(const std::string & fname) {
mtmd_bitmap bitmap;
if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
if (!bmp.ptr) {
return false;
}
bitmaps.push_back(std::move(bitmap));
bitmaps.entries.push_back(std::move(bmp));
return true;
}
};
Expand Down Expand Up @@ -193,27 +193,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());

mtmd_input_text text;
text.text = formatted_chat.prompt;
text.text = formatted_chat.prompt.c_str();
text.add_special = add_bos;
text.parse_special = true;
mtmd_input_chunks chunks;

if (g_is_interrupted) return 0;

int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, ctx.bitmaps);
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
chunks.ptr.get(), // output
&text, // text
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (res != 0) {
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
return 1;
}

ctx.bitmaps.clear();

if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
ctx.bitmaps.entries.clear();

llama_pos new_n_past;
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
ctx.lctx, // lctx
chunks.ptr.get(), // chunks
ctx.n_past, // n_past
0, // seq_id
ctx.n_batch, // n_batch
true, // logits_last
&new_n_past)) {
LOG_ERR("Unable to eval prompt\n");
return 1;
}

ctx.n_past += mtmd_helper_get_n_pos(chunks);
ctx.n_past = new_n_past;

LOG("\n");

Expand Down Expand Up @@ -246,7 +259,7 @@ int main(int argc, char ** argv) {
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

// ctrl+C handling
// Ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
Expand Down
Loading
Loading