lora : improve compat with mergekit-extract-lora #11131

Merged: 8 commits, Jan 8, 2025
34 changes: 31 additions & 3 deletions convert_lora_to_gguf.py
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name
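
As an illustration of the mapping above, an embedding tensor from a mergekit-extract-lora adapter resolves to the base embedding name (the adapter-side tensor name here is illustrative):

get_base_tensor_name("base_model.model.model.embed_tokens.lora_embedding_A")
# -> "model.embed_tokens.weight"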


@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
"--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:

dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
lparams: dict[str, Any] = json.load(f)

# load base model
if dir_base_model is None:
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
# note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
# mergekit-extract-lora adds these layernorm tensors to the adapter; we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
# mergekit-extract-lora adds these layernorm tensors to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue

# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()

# note: mergekit-extract-lora flips and transposes A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T

yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
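
For reference, a possible invocation of the converter using the new flag; the adapter directory is a placeholder, and only options shown in this diff are assumed:

python convert_lora_to_gguf.py ./my-extracted-lora --base-model-id meta-llama/Llama-3.2-1B-Instruct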

24 changes: 18 additions & 6 deletions src/llama-adapter.cpp
@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");

if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}

struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}

// save tensor to adapter
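
A minimal sketch of the shape constraints enforced above, written with ggml's ne[] dimensions as Python tuples (w_ne is the matching base-model tensor; the names are illustrative, not part of the codebase):

def lora_shapes_ok(w_ne, a_ne, b_ne, is_token_embd):
    if is_token_embd:
        # A and B arrive flipped for token_embd; B stays non-transposed (see llm_build_inp_embd)
        return w_ne[0] == b_ne[1] and w_ne[1] == a_ne[1]
    # regular tensors: lora_a is transposed and its inner dimension must match lora_b
    return w_ne[0] == a_ne[0] and w_ne[1] == b_ne[1] and a_ne[1] == b_ne[0]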
7 changes: 7 additions & 0 deletions src/llama-adapter.h
@@ -45,6 +45,13 @@ struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;

// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}

llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
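
The helper mirrors the scaling previously computed inline in llm_build_lora_mm (see src/llama.cpp below); a Python rendering of the same formula, with the rank taken from b->ne[0]:

def lora_scale(alpha: float, adapter_scale: float, rank: float) -> float:
    # alpha == 0 falls back to the raw per-adapter scale
    return adapter_scale * alpha / rank if alpha else adapter_scale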
21 changes: 18 additions & 3 deletions src/llama.cpp
@@ -2545,6 +2545,21 @@ static struct ggml_tensor * llm_build_inp_embd(
ggml_set_input(lctx.inp_tokens);

inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);

// apply lora for embedding tokens if needed
for (auto & it : lctx.lora_adapters) {
struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
if (lora == nullptr) {
continue;
}
const float adapter_scale = it.second;
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
ctx, lora->b, // non-transposed lora_b
ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
), scale);
inpL = ggml_add(ctx, inpL, inpL_delta);
}
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
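
A minimal NumPy sketch (illustrative shapes and values) of what the new embedding path above computes; it assumes token_embd.lora_a was already transposed by the converter and lora_b is stored non-transposed:

import numpy as np

n_vocab, n_embd, rank = 8, 4, 2
tok_embd = np.random.randn(n_vocab, n_embd)   # base embedding table
lora_a   = np.random.randn(n_vocab, rank)     # token_embd.lora_a (transposed by the converter)
lora_b   = np.random.randn(n_embd, rank)      # non-transposed lora_b
tok_ids  = np.array([1, 5, 3])                # lctx.inp_tokens
scale    = 1.0                                # lora->get_scale(alpha, adapter_scale)

base  = tok_embd[tok_ids]                     # ggml_get_rows(tok_embd, inp_tokens)
delta = lora_a[tok_ids] @ lora_b.T            # ggml_mul_mat(lora_b, ggml_get_rows(lora_a, inp_tokens))
inpL  = base + scale * delta                  # ggml_add(inpL, ggml_scale(delta, scale))
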
@@ -2617,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm(
if (lora == nullptr) {
continue;
}
const float alpha = it.first->alpha;
const float rank = (float) lora->b->ne[0];
const float scale = alpha ? it.second * alpha / rank : it.second;
const float adapter_scale = it.second;
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * ab_cur = ggml_mul_mat(
ctx0, lora->b,
ggml_mul_mat(ctx0, lora->a, cur)
@@ -3967,6 +3981,7 @@ struct llm_build_context {

// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {

cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);