@@ -3,12 +3,8 @@
 #include "build-info.h"
 #include "grammar-parser.h"
 
-// #define SERVER_MULTIMODAL_SUPPORT
-
-#ifdef SERVER_MULTIMODAL_SUPPORT
 #include "../llava/clip.h"
 #include "stb_image.h"
-#endif
 
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -63,7 +59,6 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
 static const std::string base64_chars =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "abcdefghijklmnopqrstuvwxyz"
@@ -112,7 +107,6 @@ std::vector<uint8_t> base64_decode(std::string const& encoded_string) {
 
     return ret;
 }
-#endif
 
 // parallel
 enum slot_state
@@ -267,7 +261,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
     return out;
 }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
 struct slot_image {
     clip_image_u8 img_data;
     bool request_encode_image = false;
@@ -276,7 +269,6 @@ struct slot_image {
     int id;
     std::string prefix_prompt = ""; // before of this image
 };
-#endif
 
 struct llama_client_slot
 {
@@ -322,9 +314,8 @@ struct llama_client_slot
     grammar_parser::parse_state parsed_grammar;
     llama_grammar *grammar = nullptr;
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
+    // multimodal
     std::vector<slot_image> images;
-#endif
 
     void reset() {
         num_prompt_tokens = 0;
@@ -347,15 +338,12 @@ struct llama_client_slot
             ctx_sampling.grammar = NULL;
         }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
         for (slot_image img : images) {
             free(img.image_embedding);
             delete[] img.img_data.data;
             img.prefix_prompt = "";
         }
         images.clear();
-#endif
-
         // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }
 
@@ -452,11 +440,9 @@ struct llama_server_context
     std::string user_name = "";      // this should be the anti prompt
     std::string assistant_name = ""; // this is for generate the prompt
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
     bool multimodal = false;
     clip_ctx *clp_ctx = nullptr;
     int n_embd;
-#endif
 
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
@@ -490,7 +476,6 @@ struct llama_server_context
     bool loadModel(const gpt_params &params_)
     {
         params = params_;
-#ifdef SERVER_MULTIMODAL_SUPPORT
         if (!params.mmproj.empty()) {
             multimodal = true;
             LOG_TEE("Multi Modal Mode Enabled");
@@ -504,15 +489,13 @@ struct llama_server_context
                 params.n_ctx = 2048;
             }
         }
-#endif
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
             return false;
         }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
         if (multimodal) {
             int n_img_embd = clip_n_mmproj_embd(clp_ctx);
             n_embd = llama_n_embd(model);
@@ -523,7 +506,6 @@ struct llama_server_context
                 return false;
             }
         }
-#endif
         n_ctx = llama_n_ctx(ctx);
         n_vocab = llama_n_vocab(model);
         candidates.reserve(n_vocab);
@@ -829,7 +811,6 @@ struct llama_server_context
         return slot.has_next_token; // continue
     }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
     bool processImages(llama_client_slot &slot) {
         for (slot_image &img : slot.images) {
             if (!img.request_encode_image) {
@@ -914,7 +895,6 @@ struct llama_server_context
         }
         return true;
     }
-#endif
 
     bool updateSlots() {
         // update the system prompt wait until all slots are idle state
@@ -1088,7 +1068,6 @@ struct llama_server_context
                         {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                     });
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
                     bool ingest_images = processImages(slot); // has images?
 
                     // process the prefix of first image
@@ -1105,15 +1084,7 @@ struct llama_server_context
                         LOG_TEE("failed processing images\n");
                         return false;
                     }
-#else
-                    for (; slot.n_past < prompt_tokens.size(); ++slot.n_past) {
-                        batch.token[batch.n_tokens] = prompt_tokens[slot.n_past];
-                        batch.pos[batch.n_tokens] = slot.n_past + num_tokens_system;
-                        batch.seq_id[batch.n_tokens] = slot.id;
-                        batch.logits[batch.n_tokens] = false;
-                        batch.n_tokens += 1;
-                    }
-#endif
+
                     // extract the logits only for the last token
                     if (batch.n_tokens > 0) {
                         batch.logits[batch.n_tokens - 1] = true;
@@ -1277,9 +1248,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    printf("  -spf FNAME, --system-prompt-file FNAME\n");
    printf("                            Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-#ifdef SERVER_MULTIMODAL_SUPPORT
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
-#endif
    printf("\n");
 }
 
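With the compile-time guard removed, the --mmproj option printed above is always part of the server's usage text. As a rough sketch of how this path is exercised (the model file names below are placeholders, not taken from this patch; only the --mmproj and -m flags themselves come from the server's options):

    ./server -m models/llava/ggml-model-q4_k.gguf --mmproj models/llava/mmproj-model-f16.gguf

Here -m is the server's ordinary model argument; --mmproj points it at the LLaVA projector that the clip.h code above loads.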
@@ -1570,7 +1539,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            );
            llama.processSystemPromptData(json::parse(systm_content));
        }
-#ifdef SERVER_MULTIMODAL_SUPPORT
        else if (arg == "--mmproj") {
            if (++i >= argc)
            {
@@ -1579,7 +1547,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.mmproj = argv[i];
        }
-#endif
        else
        {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -1697,11 +1664,7 @@ static json format_partial_response(
        {"content", content},
        {"stop", false},
        {"slot_id", slot->id},
-#ifdef SERVER_MULTIMODAL_SUPPORT
        {"multimodal", llama.multimodal}
-#else
-        {"multimodal", false}
-#endif
    };
 
    if (slot->sparams.n_probs > 0)
@@ -1810,8 +1773,8 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
            }
        }
    }
+
    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama, slot));
-#ifdef SERVER_MULTIMODAL_SUPPORT
    if (!llama.multimodal) {
        return;
    }
@@ -1882,7 +1845,6 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
            slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
        }
    }
-#endif
 }
 
 static void parse_options_infill(const json &body, llama_server_context &llama, llama_client_slot *slot)
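Once the server runs with a projector loaded, completion requests can carry images referenced from the prompt. The sketch below assumes the request schema the server documents for this feature (an image_data array of base64-encoded images addressed as [img-ID] inside the prompt) and the default listen address; treat the field names, port, and id as assumptions rather than something this diff verifies:

    curl http://localhost:8080/completion -d '{
        "prompt": "USER:[img-10] Describe the image.\nASSISTANT:",
        "image_data": [{"data": "<base64 of the image>", "id": 10}]
    }'

The base64 payload is what base64_decode() above turns back into raw bytes before processImages() hands the image to the CLIP encoder.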