@@ -103,6 +103,7 @@ struct mtmd_context {
     bool print_timings;
     int n_threads;
     std::string media_marker;
+    const int n_embd_text;
 
     // these are not token, but strings used to mark the beginning and end of image/audio embeddings
     std::string img_beg;
@@ -137,7 +138,8 @@ struct mtmd_context {
         text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
-        media_marker (ctx_params.media_marker)
+        media_marker (ctx_params.media_marker),
+        n_embd_text  (llama_model_n_embd(text_model))
     {
         if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -156,12 +158,26 @@ struct mtmd_context {
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
+
+        // if both vision and audio mmproj are present, we need to validate their n_embd
+        if (ctx_v && ctx_a) {
+            int n_embd_v = clip_n_mmproj_embd(ctx_v);
+            int n_embd_a = clip_n_mmproj_embd(ctx_a);
+            if (n_embd_v != n_embd_a) {
+                throw std::runtime_error(string_format(
+                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                    n_embd_v, n_embd_a));
+            }
+        }
 
-        if (llama_model_n_embd(text_model) != n_embd_projected()) {
+        // since we already validate n_embd of vision and audio mmproj,
+        // we can safely assume that they are the same
+        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+        if (n_embd_text != n_embd_clip) {
             throw std::runtime_error(string_format(
                 "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
                 "hint: you may be using wrong mmproj\n",
-                llama_model_n_embd(text_model), n_embd_projected()));
+                n_embd_text, n_embd_clip));
         }
         if (ctx_v) {
             init_vision();
@@ -294,11 +310,6 @@ struct mtmd_context {
         return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
     }
 
-    // both audio and vision contexts have the n_embd output dimension
-    int n_embd_projected() const {
-        return clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
-    }
-
     ~mtmd_context() {
         clip_free(ctx_a);
         clip_free(ctx_v);
@@ -716,7 +727,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             LOG_ERR("%s: model does not support audio input\n", __func__);
             return 1;
         }
-        int n_mmproj_embd = ctx->n_embd_projected();
+        int n_mmproj_embd = ctx->n_embd_text;
         ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
             ctx->ctx_a,