@@ -716,13 +716,16 @@ int llama_main(
         gpt_vocab vocab,
         llama_model model,
         int64_t t_load_us,
-        int64_t t_main_start_us) {
+        int64_t t_main_start_us,
+        FILE *instream,
+        FILE *outstream,
+        FILE *errstream) {
 
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(errstream, "%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
@@ -768,13 +771,13 @@ int llama_main(
         params.interactive = true;
     }
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(errstream, "\n");
+    fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    fprintf(stderr, "\n");
+    fprintf(errstream, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -786,22 +789,22 @@ int llama_main(
         signal(SIGINT, sigint_handler);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        fprintf(errstream, "%s: interactive mode on.\n", __func__);
 
         if (antipromptv_inp.size()) {
             for (size_t apindex = 0; apindex < antipromptv_inp.size(); ++apindex) {
                 auto antiprompt_inp = antipromptv_inp.at(apindex);
-                fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.at(apindex).c_str());
-                fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+                fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.at(apindex).c_str());
+                fprintf(errstream, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
                 for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                    fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                    fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
                 }
-                fprintf(stderr, "\n");
+                fprintf(errstream, "\n");
             }
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "\n\n");
+    fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(errstream, "\n\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -814,7 +817,7 @@ int llama_main(
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
+        fprintf(errstream, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -830,7 +833,7 @@ int llama_main(
 
     // set the color for the prompt which will be output initially
     if (params.use_color) {
-        printf(ANSI_COLOR_YELLOW);
+        fprintf(outstream, ANSI_COLOR_YELLOW);
     }
 
     while (remaining_tokens > 0 || params.interactive) {
@@ -839,7 +842,7 @@ int llama_main(
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
+                fprintf(errstream, "Failed to predict\n");
                 return 1;
             }
 
@@ -898,16 +901,16 @@ int llama_main(
 
             // reset color to default if we there is no pending user input
             if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
+                fprintf(outstream, ANSI_COLOR_RESET);
             }
         }
 
        // display text
        if (!input_noecho) {
            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                fprintf(outstream, "%s", vocab.id_to_token[id].c_str());
            }
-            fflush(stdout);
+            fflush(outstream);
        }
 
        // in interactive mode, and not currently processing queued inputs;
@@ -932,16 +935,16 @@ int llama_main(
                 // currently being interactive
                 bool another_line = true;
                 while (another_line) {
-                    fflush(stdout);
+                    fflush(outstream);
                     char buf[256] = {0};
                     int n_read;
-                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+                    if (params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumable empty line, consume the newline
-                        std::ignore = scanf("%*c");
+                        std::ignore = fscanf(instream, "%*c");
                         n_read=0;
                     }
-                    if (params.use_color) printf(ANSI_COLOR_RESET);
+                    if (params.use_color) fprintf(outstream, ANSI_COLOR_RESET);
 
                     if (n_read > 0 && buf[n_read-1]=='\\') {
                         another_line = true;
@@ -974,7 +977,7 @@ int llama_main(
             if (params.interactive) {
                 is_interacting = true;
             } else {
-                fprintf(stderr, " [end of text]\n");
+                fprintf(errstream, " [end of text]\n");
                 break;
             }
         }
@@ -994,18 +997,18 @@ int llama_main(
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(errstream, "\n\n");
+        fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(errstream, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(errstream, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(errstream, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(errstream, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
 
     if (params.use_color) {
-        printf(ANSI_COLOR_RESET);
+        fprintf(outstream, ANSI_COLOR_RESET);
     }
 
     return 0;
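Taken together, these hunks change llama_main() so that it no longer writes to the process-wide stdin/stdout/stderr but to three caller-supplied FILE* streams (instream, outstream, errstream). The sketch below shows how an embedding program might call the refactored function, for example to capture the generated text in a file while keeping diagnostics on the terminal. It is a minimal illustration, not part of this commit: the gpt_params first argument, the already-loaded model/vocab arguments, and the output file name are assumptions based on the surrounding llama.cpp code of the time; only the three new FILE* parameters come from the diff above.

// Hypothetical caller (illustration only). Assumes it is compiled inside the
// llama.cpp project of this era, so gpt_params, gpt_vocab, llama_model and
// llama_main() with the signature shown in the first hunk are available.
#include <cstdio>
#include <cstdint>

int generate_to_file(gpt_params params, gpt_vocab vocab, llama_model model,
                     int64_t t_load_us, int64_t t_main_start_us) {
    // Send generated text to a file, keep logging on stderr, and read
    // interactive input from stdin. "generation.txt" is just an example name.
    FILE *out = std::fopen("generation.txt", "w");
    if (out == nullptr) {
        std::fprintf(stderr, "failed to open generation.txt\n");
        return 1;
    }

    const int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                               stdin,    // instream:  interactive user input
                               out,      // outstream: generated tokens
                               stderr);  // errstream: progress and timing info

    std::fclose(out);
    return ret;
}

Because the streams are ordinary FILE* values, the same caller could just as easily pass a pipe, a memory stream, or stdout itself to reproduce the old behaviour.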