Commit 26df64a

Fix passing param
1 parent 12112bf

4 files changed: 39 additions & 29 deletions

BRANCH_SETUP.md

Lines changed: 6 additions & 4 deletions
````diff
@@ -30,19 +30,21 @@
 
 Run main with base model and lora adapter to hot-swap
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
-    --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+    --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
     -ngl 0 \
     -n 128
 ```
 
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer that holds the base tensors.
 
 # Logic
 
 
 
+
 # Current status
 
-- Only ony Lora adapter can be passed.
+- Only one LoRA adapter can be passed.
+- The adapter is applied only to the Q, K, V matrices to keep the code contained (the finetuning trained LoRA tensors for all linear layers).
 - GPU not supported
````
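The GPU limitation noted above points at the likely missing piece: the adapter tensors stay in host memory while the base weights are offloaded once `-ngl > 0`. The sketch below shows one possible way to place the LoRA tensors on the same backend as the base weights. It is an untested assumption, not code from this branch: it assumes `load_lora` leaves the adapter tensors in a CPU-side `ggml_context`, that the relevant base weights live on a single backend, and `upload_lora_tensors` is a hypothetical helper name.

```cpp
// Hypothetical helper (not in this branch): copy LoRA tensors from a CPU-side
// ggml_context into a buffer allocated on the backend that holds the base weights.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t upload_lora_tensors(struct ggml_context * cpu_ctx, ggml_backend_t backend) {
    // metadata-only context for the device-side copies
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 128 * ggml_tensor_overhead(), // room for a small adapter's tensor headers
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * dev_ctx = ggml_init(ip);

    // mirror every adapter tensor (shape/type/name) into the device context
    for (struct ggml_tensor * t = ggml_get_first_tensor(cpu_ctx); t != NULL; t = ggml_get_next_tensor(cpu_ctx, t)) {
        ggml_set_name(ggml_dup_tensor(dev_ctx, t), t->name);
    }

    // allocate all mirrored tensors in one buffer on the target backend (e.g. CUDA)
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(dev_ctx, backend);

    // upload the tensor data host -> device
    struct ggml_tensor * src = ggml_get_first_tensor(cpu_ctx);
    struct ggml_tensor * dst = ggml_get_first_tensor(dev_ctx);
    while (src != NULL && dst != NULL) {
        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
        src = ggml_get_next_tensor(cpu_ctx, src);
        dst = ggml_get_next_tensor(dev_ctx, dst);
    }
    return buf; // caller keeps buf (and dev_ctx) alive for the lifetime of the context
}
```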

common/common.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
```
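The added lines copy the `std::string` path from `gpt_params` into the fixed `char hot_lora[256]` field of `llama_context_params`. The explicit terminator matters because `strncpy` does not write a `'\0'` when the source fills the destination. A standalone illustration of the same pattern (the struct and function names here are hypothetical, not from the commit):

```cpp
// Illustration of the fixed-buffer copy used by the commit: strncpy plus an
// explicit terminator, since strncpy leaves the array unterminated when the
// source string is 255 bytes or longer.
#include <cstring>
#include <string>

struct cparams_sketch {
    char hot_lora[256];
};

static void set_hot_lora(cparams_sketch & cp, const std::string & path) {
    std::strncpy(cp.hot_lora, path.c_str(), sizeof(cp.hot_lora) - 1);
    cp.hot_lora[sizeof(cp.hot_lora) - 1] = '\0'; // guarantee termination even on truncation
}
```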

llama.cpp

Lines changed: 28 additions & 25 deletions
```diff
@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
@@ -2502,7 +2502,7 @@ struct llama_context {
 
     llama_cparams cparams;
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;
 
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora =*/ "",
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
-    lora_params->lora.push_back(lora);
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < lora_params->lora.size(); ++i) {
-        struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
-        if (llora_data != NULL) {
-            loras.push_back(llora_data);
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+        lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
+        lora_params->lora.push_back(lora);
+        // load all loras
+        std::vector<struct lora_data *> loras;
+        for (size_t i = 0; i < lora_params->lora.size(); ++i) {
+            struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
+            if (llora_data != NULL) {
+                loras.push_back(llora_data);
+            }
         }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-    // Assign data
-    ctx->llora_data = *loras[0];
+        if (loras.size() == 0) {
+            fprintf(stderr, "warning: no lora adapters will be applied.\n");
+        }
+        // Assign data
+        ctx->llora_data = *loras[0];
 
-    // build the map?
-    ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-    std::vector<std::string> keys;
-    for (const auto& pair : ctx->lora_weights_map) {
-        keys.push_back(pair.first);
+        // build the map?
+        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
+        std::vector<std::string> keys;
+        for (const auto& pair : ctx->lora_weights_map) {
+            keys.push_back(pair.first);
+        }
     }
 
-
-
-    /// END LORA
+    /// LORA
 
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
```
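`llama_new_context_with_model` now loads the adapter only when `hot_lora` is non-empty and builds `lora_weights_map`, which pairs tensor names with their low-rank factors so the hot-swap path can combine them with the base weights (the `lora_mul_mat` mentioned in BRANCH_SETUP.md). That function is not part of this diff; the sketch below is only an assumption about the intended math, y = W x + scale * B(A x), written with standard ggml graph ops, and `loraA`/`loraB` are placeholder names.

```cpp
// Assumed shape of the hot-swap computation (not the branch's actual lora_mul_mat):
// y = W x + scale * (B (A x)), expressed as ggml graph ops.
#include "ggml.h"

static struct ggml_tensor * lora_mul_mat_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * W,      // base projection weight (e.g. wq/wk/wv)
        struct ggml_tensor  * loraA,  // low-rank factor A (placeholder name)
        struct ggml_tensor  * loraB,  // low-rank factor B (placeholder name)
        struct ggml_tensor  * cur,    // current activations
        float                 scale) {
    struct ggml_tensor * base  = ggml_mul_mat(ctx, W, cur);                       // W x
    struct ggml_tensor * delta = ggml_mul_mat(ctx, loraB,
                                              ggml_mul_mat(ctx, loraA, cur));     // B (A x)
    return ggml_add(ctx, base, ggml_scale(ctx, delta, scale));                    // W x + scale * B A x
}
```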

llama.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -292,6 +292,7 @@ extern "C" {
         uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        char hot_lora[256];       // path to the hot lora file
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
```
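With `hot_lora` exposed in `llama_context_params`, a program built against `llama.h` can request the hot-swapped adapter directly, without going through `common`. Below is a minimal caller-side sketch, assuming the usual llama.h entry points of this API version and reusing the paths from BRANCH_SETUP.md; it is illustrative, not part of the commit.

```cpp
// Hypothetical caller: fill the new hot_lora field before creating a context.
#include <cstring>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("./models/open-llama/ggml-model-f16.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // copy the adapter path into the fixed 256-byte field, as common.cpp does
    std::strncpy(cparams.hot_lora,
                 "models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin",
                 sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run generation as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

Because the field is a fixed 256-byte array, longer paths are silently truncated by this copy pattern; that is a consequence of keeping the C API free of `std::string`.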
