@@ -113,11 +113,11 @@ struct llava_context {
 };
 
 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
 
     // load and preprocess the image
     llava_image_embed * embed = NULL;
@@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
         if (!embed) {
-            LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
         }
     }
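
Note (not part of the patch): the new fname parameter only changes where the path comes from; the embedding itself is still produced by the llava.h helper seen above. A minimal standalone sketch of that call, assuming a clip context and thread count obtained as in load_image(), with a made-up image path:

    // Sketch only: embed a single image file, then release the embedding.
    llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, n_threads, "images/cat.jpg");
    if (embed) {
        // embed->embed holds the projected image features;
        // embed->n_image_pos is the number of token positions they occupy.
        llava_image_embed_free(embed);
    }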
@@ -207,17 +207,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     printf("\n");
 }
 
-
-static struct llava_context * llava_init(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
+static struct llama_model * llava_init(gpt_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);
 
@@ -228,6 +218,19 @@ static struct llava_context * llava_init(gpt_params * params) {
         LOG_TEE("%s: error: unable to load model\n", __func__);
         return NULL;
     }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
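
The two hunks above split the old llava_init() in two: llava_init() now does only the one-time work (backend init, NUMA init, loading the weights), while the new llava_init_context() creates the per-image clip and llama contexts. Condensed, the lifecycle that main() adopts below looks like this (sketch only; error handling omitted, and the comment on clearing the model pointer is an assumption that llava_free() would otherwise free whatever model the context still references):

    llama_model * model = llava_init(&params);    // backend + weights, loaded once
    for (auto & image : params.image) {           // a fresh context per image
        llava_context * ctx_llava = llava_init_context(&params, model);
        // ... embed the image, evaluate the prompt, print timings ...
        ctx_llava->model = NULL;   // assumed: keeps llava_free() from freeing the shared model
        llava_free(ctx_llava);
    }
    llama_free_model(model);                      // release the weights after the last image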
@@ -286,24 +289,30 @@ int main(int argc, char ** argv) {
         show_additional_info(argc, argv);
         return 1;
     }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        LOG_TEE("%s: error: failed to init llava\n", __func__);
+    auto model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
 
-    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
-    }
+    for (auto & image : params.image) {
+        auto ctx_llava = llava_init_context(&params, model);
 
-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+        auto image_embed = load_image(ctx_llava, &params, image);
+        if (!image_embed) {
+            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+            return 1;
+        }
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-    llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+    llama_free_model(model);
 
-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
     return 0;
 }
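
With --image now accepted multiple times, the updated usage string in show_additional_info() translates to an invocation along these lines (binary name and image paths illustrative):

    ./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf \
        --mmproj llava-v1.5-7b/mmproj-model-f16.gguf \
        --image first.jpg --image second.jpg \
        --temp 0.1 -p "describe the image in detail."

Each image gets its own context, prompt pass, and timings report before the shared model is freed once at the end.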