
Commit f0722b1

clean up failed attempt at implementing control-vector hot-swapping
1 parent 92070ca commit f0722b1

3 files changed: +8, -92 lines

examples/server/server.cpp

Lines changed: 1 addition & 80 deletions
```diff
@@ -3176,84 +3176,6 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
-    const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
-        json vectors = json::array();
-
-        for (const auto & vec : ctx_server.params.control_vectors) {
-            vectors.push_back(json {
-                { "fname", vec.fname },
-                { "strength", vec.strength }
-            });
-        }
-        json data = {
-            { "vectors", vectors },
-            { "layer_start", ctx_server.params.control_vector_layer_start },
-            { "layer_end", ctx_server.params.control_vector_layer_end }
-        };
-        res.set_content(data.dump(), "application/json; charset=utf-8");
-    };
-
-    const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
-        json data = json::parse(req.body);
-        std::vector<llama_control_vector_load_info> vec_params;
-
-        if (data.contains("vectors") && data["vectors"].is_array()) {
-            for (const auto &item : data["vectors"]) {
-                auto v = item.get<llama_control_vector_load_info>();
-                std::cout << "Add vector: " << v.fname << " " << v.strength << "\n";
-                vec_params.push_back(v);
-            }
-        } else {
-            std::cerr << "No vectors passed\n";
-            res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER));
-            return;
-        }
-        const auto cvec = llama_control_vector_load(vec_params);
-        if (cvec.n_embd == -1) {
-            std::cerr << "Could not load control vector\n";
-            res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER));
-            return;
-        }
-
-        if (ctx_server.params.control_vector_layer_start <= 0) {
-            ctx_server.params.control_vector_layer_start = 1;
-        }
-        if (ctx_server.params.control_vector_layer_end <= 0){
-            ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model);
-        }
-        int err = llama_control_vector_apply(ctx_server.ctx,
-            cvec.data.data(),
-            cvec.data.size(),
-            cvec.n_embd,
-            ctx_server.params.control_vector_layer_start,
-            ctx_server.params.control_vector_layer_end);
-        if (err) {
-            std::cerr << "Could not apply control vector\n";
-            res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER));
-            return;
-        }
-        ctx_server.params.control_vectors.clear();
-        for (auto v : vec_params) {
-            //std::cout << "set vector param: " << v.fname << " " << v.strength << "\n";
-            ctx_server.params.control_vectors.push_back(v);
-        }
-
-        /*std::cerr << "Maybe we need to do this initiation ritual before it werks?\n"; // No, it's still all garbled bullshit.
-
-        std::vector<llama_token> tmp = { llama_token_bos(ctx_server.model), llama_token_eos(ctx_server.model), };
-        std::cerr << "decode, bro\n";
-        llama_decode(ctx_server.ctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) ctx_server.params.n_batch), 0, 0));
-        std::cerr << "clear that fucking cache\n";
-        llama_kv_cache_clear(ctx_server.ctx);
-        std::cerr << "symcr0nice or what\n";
-        llama_synchronize(ctx_server.ctx);
-        std::cerr << "time will tell\n";
-        llama_reset_timings(ctx_server.ctx);*/
-        handle_get_control_vectors(req, res);
-    };
-
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
@@ -3603,10 +3525,8 @@ int main(int argc, char ** argv) {
     svr->Get ("/health", handle_health);
     svr->Get ("/slots", handle_slots);
     svr->Get ("/metrics", handle_metrics);
-    svr->Get ("/control-vectors", handle_get_control_vectors);
     svr->Get ("/props", handle_props);
     svr->Get ("/v1/models", handle_models);
-    svr->Post("/control-vectors", handle_set_control_vectors);
     svr->Post("/completion", handle_completions); // legacy
     svr->Post("/completions", handle_completions);
    svr->Post("/v1/completions", handle_completions);
@@ -3681,3 +3601,4 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+
```
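For reference, the two removed routes formed a small hot-swap API: GET /control-vectors reported the currently loaded vectors and layer range, and POST /control-vectors reloaded and re-applied a new set. Below is a minimal sketch of the request body the POST handler parsed, built with the nlohmann::json header the server example already vendors; the field names come from the handler code above, while happy.gguf is a hypothetical file name.

```cpp
#include "json.hpp"   // single-header nlohmann::json vendored by examples/server
using json = nlohmann::json;

int main() {
    // Shape the removed handler expected: a "vectors" array of
    // { fname, strength } objects.
    json body;
    body["vectors"] = json::array();
    body["vectors"].push_back({ { "fname", "happy.gguf" }, { "strength", 0.8 } });

    // body.dump() -> {"vectors":[{"fname":"happy.gguf","strength":0.8}]}
    // On success the handler replied with the same JSON that
    // GET /control-vectors produced: vectors plus layer_start/layer_end.
    return 0;
}
```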

examples/server/utils.hpp

Lines changed: 0 additions & 5 deletions
```diff
@@ -615,8 +615,3 @@ static json format_error_response(const std::string & message, const enum error_
         {"type", type_str},
     };
 }
-
-void from_json(const json& j, llama_control_vector_load_info& l) {
-    j.at("strength").get_to(l.strength);
-    j.at("fname").get_to(l.fname);
-}
```
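The deleted from_json overload is the hook nlohmann::json discovers through argument-dependent lookup; it is what made item.get<llama_control_vector_load_info>() in the removed server handler compile, and with that handler gone it is dead code. A minimal self-contained sketch of the pattern it enabled, with the struct fields reproduced as an assumption about common.h at this revision and a hypothetical file name:

```cpp
#include <string>
#include "json.hpp"
using json = nlohmann::json;

// Assumed layout of the load-info struct from common.h at this revision.
struct llama_control_vector_load_info {
    float strength;
    std::string fname;
};

// The overload being deleted above: lets json::get<T>() find the conversion
// via argument-dependent lookup.
void from_json(const json & j, llama_control_vector_load_info & l) {
    j.at("strength").get_to(l.strength);
    j.at("fname").get_to(l.fname);
}

int main() {
    json j = json::parse(R"({ "fname": "happy.gguf", "strength": 0.8 })");
    auto info = j.get<llama_control_vector_load_info>();  // fname and strength filled in
    return info.fname.empty();
}
```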

llama.cpp

Lines changed: 7 additions & 7 deletions
```diff
@@ -1950,7 +1950,6 @@ struct llama_control_vector {
     }
 
     ~llama_control_vector() {
-        LLAMA_LOG_ERROR("Kill the control vector\n");
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
         }
@@ -13995,9 +13994,9 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
 }
 
 static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
-    cvec.tensors.clear();
-    cvec.ctxs.clear();
-    cvec.bufs.clear();
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
 
     // count layer buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
@@ -14063,9 +14062,10 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
         return 1;
     }
 
-    if (!llama_control_vector_init(cvec, model)) {
-        LLAMA_LOG_ERROR("%s: FUCKING BITCH\n", __func__);
-        return 1;
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
     }
 
     cvec.layer_start = il_start;
```
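The net effect in llama.cpp: a context's control vector is initialized exactly once. llama_control_vector_init now asserts it only ever runs on an empty llama_control_vector, and llama_control_vector_apply performs initialization only on first use instead of rebuilding the tensors each call. A sketch of the load-once flow that remains supported, assuming the llama_control_vector_load helper and its load-info struct live in common.h at this revision (apply before serving any requests; swapping in a different vector at runtime is exactly what this commit gives up on):

```cpp
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

// Load a control vector from disk and apply it to all layers, once, up front.
// Returns true on success. Field names and call signatures match the code
// shown in the diffs above; their common.h location is an assumption.
static bool apply_control_vector_once(llama_context * ctx, const llama_model * model,
                                      const std::string & fname, float strength) {
    llama_control_vector_load_info info;
    info.fname    = fname;     // hypothetical GGUF control-vector file
    info.strength = strength;  // per-file scale applied at load time

    const auto cvec = llama_control_vector_load({ info });
    if (cvec.n_embd == -1) {
        return false;  // load failed (missing file or inconsistent n_embd)
    }

    // Same defaults the removed server handler fell back to: layers are
    // 1-based and the range runs through the model's last layer.
    const int32_t il_start = 1;
    const int32_t il_end   = llama_n_layer(model);

    // After this commit the first apply initializes cvec's tensors; the
    // endpoint-driven hot-swap that re-applied new vectors at runtime is gone.
    return llama_control_vector_apply(ctx, cvec.data.data(), cvec.data.size(),
                                      cvec.n_embd, il_start, il_end) == 0;
}
```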
