@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
63
63
#endif
64
64
65
65
struct mtmd_cli_context {
66
- mtmd_context_ptr ctx_vision;
66
+ mtmd::context_ptr ctx_vision;
67
67
common_init_result llama_init;
68
68
69
69
llama_model * model;
@@ -72,7 +72,7 @@ struct mtmd_cli_context {
72
72
llama_batch batch;
73
73
int n_batch;
74
74
75
- std::vector<mtmd_bitmap> bitmaps;
75
+ mtmd::bitmaps bitmaps;
76
76
77
77
// note: we know that gemma3 template is "linear", meaning each turn is completely separate from the others
78
78
// so here we don't need to keep track of chat history
@@ -115,12 +115,12 @@ struct mtmd_cli_context {
115
115
116
116
void init_vision_context (common_params & params) {
117
117
const char * clip_path = params.mmproj .path .c_str ();
118
- ctx_vision. reset ( mtmd_init_from_file (clip_path, model, mtmd_context_params{
119
- /* use_gpu */ params.mmproj_use_gpu ,
120
- /* timings */ true ,
121
- /* n_threads */ params.cpuparams .n_threads ,
122
- /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
123
- } ));
118
+ mtmd_context_params mparams = mtmd_context_params_default ();
119
+ mparams. use_gpu = params.mmproj_use_gpu ;
120
+ mparams. print_timings = true ;
121
+ mparams. n_threads = params.cpuparams .n_threads ;
122
+ mparams. verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
123
+ ctx_vision. reset ( mtmd_init_from_file (clip_path, model, mparams ));
124
124
if (!ctx_vision.get ()) {
125
125
LOG_ERR (" Failed to load vision model from %s\n " , clip_path);
126
126
exit (1 );
@@ -139,11 +139,11 @@ struct mtmd_cli_context {
139
139
}
140
140
141
141
bool load_image (const std::string & fname) {
142
- mtmd_bitmap bitmap;
143
- if (mtmd_helper_bitmap_init_from_file (fname. c_str (), bitmap) ) {
142
+ mtmd:: bitmap bmp ( mtmd_helper_bitmap_init_from_file (fname. c_str ())) ;
143
+ if (!bmp. ptr ) {
144
144
return false ;
145
145
}
146
- bitmaps.push_back (std::move (bitmap ));
146
+ bitmaps.entries . push_back (std::move (bmp ));
147
147
return true ;
148
148
}
149
149
};
@@ -193,27 +193,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
193
193
LOG_DBG (" formatted_chat.prompt: %s\n " , formatted_chat.prompt .c_str ());
194
194
195
195
mtmd_input_text text;
196
- text.text = formatted_chat.prompt ;
196
+ text.text = formatted_chat.prompt . c_str () ;
197
197
text.add_special = add_bos;
198
198
text.parse_special = true ;
199
- mtmd_input_chunks chunks;
200
199
201
200
if (g_is_interrupted) return 0 ;
202
201
203
- int32_t res = mtmd_tokenize (ctx.ctx_vision .get (), chunks, text, ctx.bitmaps );
202
+ mtmd::input_chunks chunks (mtmd_input_chunks_init ());
203
+ auto bitmaps_c_ptr = ctx.bitmaps .c_ptr ();
204
+ int32_t res = mtmd_tokenize (ctx.ctx_vision .get (),
205
+ chunks.ptr .get (), // output
206
+ &text, // text
207
+ bitmaps_c_ptr.data (),
208
+ bitmaps_c_ptr.size ());
204
209
if (res != 0 ) {
205
210
LOG_ERR (" Unable to tokenize prompt, res = %d\n " , res);
206
211
return 1 ;
207
212
}
208
213
209
- ctx.bitmaps .clear ();
210
-
211
- if (mtmd_helper_eval (ctx.ctx_vision .get (), ctx.lctx , chunks, ctx.n_past , 0 , ctx.n_batch )) {
214
+ ctx.bitmaps .entries .clear ();
215
+
216
+ llama_pos new_n_past;
217
+ if (mtmd_helper_eval_chunks (ctx.ctx_vision .get (),
218
+ ctx.lctx , // lctx
219
+ chunks.ptr .get (), // chunks
220
+ ctx.n_past , // n_past
221
+ 0 , // seq_id
222
+ ctx.n_batch , // n_batch
223
+ true , // logits_last
224
+ &new_n_past)) {
212
225
LOG_ERR (" Unable to eval prompt\n " );
213
226
return 1 ;
214
227
}
215
228
216
- ctx.n_past += mtmd_helper_get_n_pos (chunks) ;
229
+ ctx.n_past = new_n_past ;
217
230
218
231
LOG (" \n " );
219
232
@@ -246,7 +259,7 @@ int main(int argc, char ** argv) {
246
259
struct common_sampler * smpl = common_sampler_init (ctx.model , params.sampling );
247
260
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict ;
248
261
249
- // ctrl +C handling
262
+ // Ctrl +C handling
250
263
{
251
264
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
252
265
struct sigaction sigint_action;
0 commit comments