Skip to content

Commit 670f849

Browse files
committed
llama-bench : add support for the RPC backend
1 parent 917dc8c commit 670f849

File tree

4 files changed

+40
-5
lines changed

4 files changed

+40
-5
lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ struct cmd_params {
178178
std::vector<ggml_type> type_v;
179179
std::vector<int> n_threads;
180180
std::vector<int> n_gpu_layers;
181+
std::vector<std::string> rpc_servers;
181182
std::vector<llama_split_mode> split_mode;
182183
std::vector<int> main_gpu;
183184
std::vector<bool> no_kv_offload;
@@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
202203
/* type_v */ {GGML_TYPE_F16},
203204
/* n_threads */ {get_math_cpu_count()},
204205
/* n_gpu_layers */ {99},
206+
/* rpc_servers */ {""},
205207
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
206208
/* main_gpu */ {0},
207209
/* no_kv_offload */ {false},
@@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
230232
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
231233
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
232234
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
235+
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
233236
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
234237
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
235238
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -384,6 +387,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
384387
}
385388
auto p = split<int>(argv[i], split_delim);
386389
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
390+
} else if (arg == "-rpc" || arg == "--rpc") {
391+
if (++i >= argc) {
392+
invalid_param = true;
393+
break;
394+
}
395+
auto p = split<std::string>(argv[i], split_delim);
396+
params.rpc_servers.insert(params.rpc_servers.end(), p.begin(), p.end());
387397
} else if (arg == "-sm" || arg == "--split-mode") {
388398
if (++i >= argc) {
389399
invalid_param = true;
@@ -519,6 +529,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
519529
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
520530
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
521531
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
532+
if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
522533
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
523534
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
524535
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +552,7 @@ struct cmd_params_instance {
541552
ggml_type type_v;
542553
int n_threads;
543554
int n_gpu_layers;
555+
std::string rpc_servers;
544556
llama_split_mode split_mode;
545557
int main_gpu;
546558
bool no_kv_offload;
@@ -553,6 +565,9 @@ struct cmd_params_instance {
553565
llama_model_params mparams = llama_model_default_params();
554566

555567
mparams.n_gpu_layers = n_gpu_layers;
568+
if (!rpc_servers.empty()) {
569+
mparams.rpc_servers = rpc_servers.c_str();
570+
}
556571
mparams.split_mode = split_mode;
557572
mparams.main_gpu = main_gpu;
558573
mparams.tensor_split = tensor_split.data();
@@ -564,6 +579,7 @@ struct cmd_params_instance {
564579
bool equal_mparams(const cmd_params_instance & other) const {
565580
return model == other.model &&
566581
n_gpu_layers == other.n_gpu_layers &&
582+
rpc_servers == other.rpc_servers &&
567583
split_mode == other.split_mode &&
568584
main_gpu == other.main_gpu &&
569585
use_mmap == other.use_mmap &&
@@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
618634
/* .type_v = */ tv,
619635
/* .n_threads = */ nt,
620636
/* .n_gpu_layers = */ nl,
637+
/* .rpc_servers = */ join(params.rpc_servers, ","),
621638
/* .split_mode = */ sm,
622639
/* .main_gpu = */ mg,
623640
/* .no_kv_offload= */ nkvo,
@@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
643660
/* .type_v = */ tv,
644661
/* .n_threads = */ nt,
645662
/* .n_gpu_layers = */ nl,
663+
/* .rpc_servers = */ join(params.rpc_servers, ","),
646664
/* .split_mode = */ sm,
647665
/* .main_gpu = */ mg,
648666
/* .no_kv_offload= */ nkvo,
@@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
668686
/* .type_v = */ tv,
669687
/* .n_threads = */ nt,
670688
/* .n_gpu_layers = */ nl,
689+
/* .rpc_servers = */ join(params.rpc_servers, ","),
671690
/* .split_mode = */ sm,
672691
/* .main_gpu = */ mg,
673692
/* .no_kv_offload= */ nkvo,
@@ -692,6 +711,7 @@ struct test {
692711
static const bool kompute;
693712
static const bool metal;
694713
static const bool sycl;
714+
static const bool rpc;
695715
static const bool gpu_blas;
696716
static const bool blas;
697717
static const std::string cpu_info;
@@ -790,6 +810,9 @@ struct test {
790810
if (sycl) {
791811
return GGML_SYCL_NAME;
792812
}
813+
if (rpc) {
814+
return "RPC";
815+
}
793816
if (gpu_blas) {
794817
return "GPU BLAS";
795818
}
@@ -803,7 +826,7 @@ struct test {
803826
static const std::vector<std::string> & get_fields() {
804827
static const std::vector<std::string> fields = {
805828
"build_commit", "build_number",
806-
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
829+
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
807830
"cpu_info", "gpu_info",
808831
"model_filename", "model_type", "model_size", "model_n_params",
809832
"n_batch", "n_ubatch",
@@ -859,7 +882,7 @@ struct test {
859882
std::vector<std::string> values = {
860883
build_commit, std::to_string(build_number),
861884
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
862-
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
885+
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
863886
cpu_info, gpu_info,
864887
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
865888
std::to_string(n_batch), std::to_string(n_ubatch),
@@ -894,6 +917,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
894917
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
895918
const bool test::blas = !!ggml_cpu_has_blas();
896919
const bool test::sycl = !!ggml_cpu_has_sycl();
920+
const bool test::rpc = !!ggml_cpu_has_rpc();
897921
const std::string test::cpu_info = get_cpu_info();
898922
const std::string test::gpu_info = get_gpu_info();
899923

@@ -1308,7 +1332,7 @@ int main(int argc, char ** argv) {
13081332

13091333
for (const auto & inst : params_instances) {
13101334
// keep the same model between tests when possible
1311-
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
1335+
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst) || !inst.rpc_servers.empty()) {
13121336
if (lmodel) {
13131337
llama_free_model(lmodel);
13141338
}

ggml-rpc.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ struct ggml_backend_rpc_buffer_context {
117117
std::string name;
118118
};
119119

120+
static std::unordered_map<std::string, ggml_backend_t> instances;
121+
120122
// RPC helper functions
121123

122124
static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
@@ -530,11 +532,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
530532

531533
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
532534
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
535+
std::string endpoint = rpc_ctx->endpoint;
533536
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context;
534537
delete buft_ctx;
535538
delete rpc_ctx->buft;
536539
delete rpc_ctx;
537540
delete backend;
541+
instances.erase(endpoint);
538542
}
539543

540544
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
@@ -624,8 +628,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
624628
/* .event_synchronize = */ NULL,
625629
};
626630

627-
static std::unordered_map<std::string, ggml_backend_t> instances;
628-
629631
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
630632
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
631633
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;

ggml.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23438,6 +23438,14 @@ int ggml_cpu_has_sycl(void) {
2343823438
#endif
2343923439
}
2344023440

23441+
int ggml_cpu_has_rpc(void) {
23442+
#if defined(GGML_USE_RPC)
23443+
return 1;
23444+
#else
23445+
return 0;
23446+
#endif
23447+
}
23448+
2344123449
int ggml_cpu_has_gpublas(void) {
2344223450
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
2344323451
ggml_cpu_has_sycl();

ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,6 +2407,7 @@ extern "C" {
24072407
GGML_API int ggml_cpu_has_sse3 (void);
24082408
GGML_API int ggml_cpu_has_ssse3 (void);
24092409
GGML_API int ggml_cpu_has_sycl (void);
2410+
GGML_API int ggml_cpu_has_rpc (void);
24102411
GGML_API int ggml_cpu_has_vsx (void);
24112412
GGML_API int ggml_cpu_has_matmul_int8(void);
24122413

0 commit comments

Comments (0)