@@ -829,41 +829,12 @@ void sigint_handler(int signo) {
 }
 #endif
 
-const char * llama_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
-
-    return s.c_str();
-}
-
-int llama_main(int argc, char ** argv) {
-    ggml_time_init();
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
+int llama_main(
+    gpt_params params,
+    llama_vocab vocab,
+    llama_model model,
+    int64_t t_load_us,
+    int64_t t_main_start_us) {
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -879,30 +850,6 @@ int llama_main(int argc, char ** argv) {
     // params.prompt = R"(// this function checks if the number n is prime
     // bool is_prime(int n) {)";
 
-    int64_t t_load_us = 0;
-
-    llama_vocab vocab;
-    llama_model model;
-
-    // load the model
-    {
-        const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
-        const int64_t t_start_us = ggml_time_us();
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx, params.n_parts, memory_type)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
     std::vector<float> logits;
 
     // determine the required inference memory per token:
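
For context, here is a minimal sketch of what the call site might look like after this change, assuming the setup code removed above moves essentially verbatim into the caller (for example a separate main.cpp). The header names, the plain main entry point, and the include list are assumptions, not part of this diff; the argument order follows the new llama_main signature shown in the first hunk.

// Hedged sketch of the caller after the refactor; not taken from this diff.
// Assumes "llama.h" declares gpt_params, llama_vocab, llama_model,
// gpt_params_parse, llama_model_load and llama_main as used here.
#include <cstdio>
#include <cstdint>

#include "ggml.h"
#include "llama.h"

int main(int argc, char ** argv) {
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();

    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    llama_vocab vocab;
    llama_model model;
    int64_t t_load_us = 0;

    // load the model up front, timing the load as the removed code did
    {
        const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
        const int64_t t_start_us = ggml_time_us();
        if (!llama_model_load(params.model, model, vocab, params.n_ctx, params.n_parts, memory_type)) {
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }
        t_load_us = ggml_time_us() - t_start_us;
    }

    // hand the loaded state and timings to the refactored entry point
    return llama_main(params, vocab, model, t_load_us, t_main_start_us);
}

Passing in the already-parsed params, the loaded vocab and model, and the timing values keeps llama_main free of argument parsing and model I/O, which is what the signature change above suggests the refactor is after.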