@@ -178,6 +178,7 @@ struct cmd_params {
178
178
std::vector<ggml_type> type_v;
179
179
std::vector<int > n_threads;
180
180
std::vector<int > n_gpu_layers;
181
+ std::vector<std::string> rpc_servers;
181
182
std::vector<llama_split_mode> split_mode;
182
183
std::vector<int > main_gpu;
183
184
std::vector<bool > no_kv_offload;
@@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
202
203
/* type_v */ {GGML_TYPE_F16},
203
204
/* n_threads */ {get_math_cpu_count ()},
204
205
/* n_gpu_layers */ {99 },
206
+ /* rpc_servers */ {"" },
205
207
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
206
208
/* main_gpu */ {0 },
207
209
/* no_kv_offload */ {false },
@@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
230
232
printf (" -ctv, --cache-type-v <t> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.type_v , ggml_type_name), " ," ).c_str ());
231
233
printf (" -t, --threads <n> (default: %s)\n " , join (cmd_params_defaults.n_threads , " ," ).c_str ());
232
234
printf (" -ngl, --n-gpu-layers <n> (default: %s)\n " , join (cmd_params_defaults.n_gpu_layers , " ," ).c_str ());
235
+ printf (" -rpc, --rpc <rpc_servers> (default: %s)\n " , join (cmd_params_defaults.rpc_servers , " ," ).c_str ());
233
236
printf (" -sm, --split-mode <none|layer|row> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.split_mode , split_mode_str), " ," ).c_str ());
234
237
printf (" -mg, --main-gpu <i> (default: %s)\n " , join (cmd_params_defaults.main_gpu , " ," ).c_str ());
235
238
printf (" -nkvo, --no-kv-offload <0|1> (default: %s)\n " , join (cmd_params_defaults.no_kv_offload , " ," ).c_str ());
@@ -384,6 +387,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
384
387
}
385
388
auto p = split<int >(argv[i], split_delim);
386
389
params.n_gpu_layers .insert (params.n_gpu_layers .end (), p.begin (), p.end ());
390
+ } else if (arg == " -rpc" || arg == " --rpc" ) {
391
+ if (++i >= argc) {
392
+ invalid_param = true ;
393
+ break ;
394
+ }
395
+ auto p = split<std::string>(argv[i], split_delim);
396
+ params.rpc_servers .insert (params.rpc_servers .end (), p.begin (), p.end ());
387
397
} else if (arg == " -sm" || arg == " --split-mode" ) {
388
398
if (++i >= argc) {
389
399
invalid_param = true ;
@@ -519,6 +529,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
519
529
if (params.type_k .empty ()) { params.type_k = cmd_params_defaults.type_k ; }
520
530
if (params.type_v .empty ()) { params.type_v = cmd_params_defaults.type_v ; }
521
531
if (params.n_gpu_layers .empty ()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers ; }
532
+ if (params.rpc_servers .empty ()) { params.rpc_servers = cmd_params_defaults.rpc_servers ; }
522
533
if (params.split_mode .empty ()) { params.split_mode = cmd_params_defaults.split_mode ; }
523
534
if (params.main_gpu .empty ()) { params.main_gpu = cmd_params_defaults.main_gpu ; }
524
535
if (params.no_kv_offload .empty ()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload ; }
@@ -541,6 +552,7 @@ struct cmd_params_instance {
541
552
ggml_type type_v;
542
553
int n_threads;
543
554
int n_gpu_layers;
555
+ std::string rpc_servers;
544
556
llama_split_mode split_mode;
545
557
int main_gpu;
546
558
bool no_kv_offload;
@@ -553,6 +565,9 @@ struct cmd_params_instance {
553
565
llama_model_params mparams = llama_model_default_params ();
554
566
555
567
mparams.n_gpu_layers = n_gpu_layers;
568
+ if (!rpc_servers.empty ()) {
569
+ mparams.rpc_servers = rpc_servers.c_str ();
570
+ }
556
571
mparams.split_mode = split_mode;
557
572
mparams.main_gpu = main_gpu;
558
573
mparams.tensor_split = tensor_split.data ();
@@ -564,6 +579,7 @@ struct cmd_params_instance {
564
579
bool equal_mparams (const cmd_params_instance & other) const {
565
580
return model == other.model &&
566
581
n_gpu_layers == other.n_gpu_layers &&
582
+ rpc_servers == other.rpc_servers &&
567
583
split_mode == other.split_mode &&
568
584
main_gpu == other.main_gpu &&
569
585
use_mmap == other.use_mmap &&
@@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
618
634
/* .type_v = */ tv,
619
635
/* .n_threads = */ nt,
620
636
/* .n_gpu_layers = */ nl,
637
+ /* .rpc_servers = */ join (params.rpc_servers , " ," ),
621
638
/* .split_mode = */ sm,
622
639
/* .main_gpu = */ mg,
623
640
/* .no_kv_offload= */ nkvo,
@@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
643
660
/* .type_v = */ tv,
644
661
/* .n_threads = */ nt,
645
662
/* .n_gpu_layers = */ nl,
663
+ /* .rpc_servers = */ join (params.rpc_servers , " ," ),
646
664
/* .split_mode = */ sm,
647
665
/* .main_gpu = */ mg,
648
666
/* .no_kv_offload= */ nkvo,
@@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
668
686
/* .type_v = */ tv,
669
687
/* .n_threads = */ nt,
670
688
/* .n_gpu_layers = */ nl,
689
+ /* .rpc_servers = */ join (params.rpc_servers , " ," ),
671
690
/* .split_mode = */ sm,
672
691
/* .main_gpu = */ mg,
673
692
/* .no_kv_offload= */ nkvo,
@@ -692,6 +711,7 @@ struct test {
692
711
static const bool kompute;
693
712
static const bool metal;
694
713
static const bool sycl;
714
+ static const bool rpc;
695
715
static const bool gpu_blas;
696
716
static const bool blas;
697
717
static const std::string cpu_info;
@@ -790,6 +810,9 @@ struct test {
790
810
if (sycl) {
791
811
return GGML_SYCL_NAME;
792
812
}
813
+ if (rpc) {
814
+ return " RPC" ;
815
+ }
793
816
if (gpu_blas) {
794
817
return " GPU BLAS" ;
795
818
}
@@ -803,7 +826,7 @@ struct test {
803
826
static const std::vector<std::string> & get_fields () {
804
827
static const std::vector<std::string> fields = {
805
828
" build_commit" , " build_number" ,
806
- " cuda" , " opencl" , " vulkan" , " kompute" , " metal" , " sycl" , " gpu_blas" , " blas" ,
829
+ " cuda" , " opencl" , " vulkan" , " kompute" , " metal" , " sycl" , " rpc" , " gpu_blas" , " blas" ,
807
830
" cpu_info" , " gpu_info" ,
808
831
" model_filename" , " model_type" , " model_size" , " model_n_params" ,
809
832
" n_batch" , " n_ubatch" ,
@@ -859,7 +882,7 @@ struct test {
859
882
std::vector<std::string> values = {
860
883
build_commit, std::to_string (build_number),
861
884
std::to_string (cuda), std::to_string (opencl), std::to_string (vulkan), std::to_string (kompute),
862
- std::to_string (metal), std::to_string (sycl), std::to_string (gpu_blas), std::to_string (blas),
885
+ std::to_string (metal), std::to_string (sycl), std::to_string (rpc), std::to_string (gpu_blas), std::to_string (blas),
863
886
cpu_info, gpu_info,
864
887
model_filename, model_type, std::to_string (model_size), std::to_string (model_n_params),
865
888
std::to_string (n_batch), std::to_string (n_ubatch),
@@ -894,6 +917,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
894
917
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
895
918
const bool test::blas = !!ggml_cpu_has_blas();
896
919
const bool test::sycl = !!ggml_cpu_has_sycl();
920
+ const bool test::rpc = !!ggml_cpu_has_rpc();
897
921
const std::string test::cpu_info = get_cpu_info();
898
922
const std::string test::gpu_info = get_gpu_info();
899
923
@@ -1308,7 +1332,7 @@ int main(int argc, char ** argv) {
1308
1332
1309
1333
for (const auto & inst : params_instances) {
1310
1334
// keep the same model between tests when possible
1311
- if (!lmodel || !prev_inst || !inst.equal_mparams (*prev_inst)) {
1335
+ if (!lmodel || !prev_inst || !inst.equal_mparams (*prev_inst) || !inst. rpc_servers . empty () ) {
1312
1336
if (lmodel) {
1313
1337
llama_free_model (lmodel);
1314
1338
}
0 commit comments