
Commit 3db2bf9

Add lora support
1 parent f2d1c47 commit 3db2bf9

File tree

9 files changed · +339 −5 lines changed

convert-lora-to-ggml.py

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+import os
+import re
+import struct
+import sys
+from dataclasses import dataclass
+from typing import Any, Sequence
+
+import numpy as np
+import torch
+
+
+# TODO: import this from convert.py once #545 is merged
+@dataclass(frozen=True)
+class UnquantizedDataType:
+    name: str
+
+DT_F16 = UnquantizedDataType('F16')
+DT_F32 = UnquantizedDataType('F32')
+
+@dataclass(frozen=True)
+class QuantizedDataType:
+    groupsize: int
+    have_addends: bool
+    have_g_idx: bool
+
+DataType = UnquantizedDataType
+
+DATA_TYPE_TO_FTYPE: dict[DataType, int] = {
+    DT_F32: 0,
+    DT_F16: 1,
+}
+
+DATA_TYPE_TO_NUMPY: dict[DataType, np.dtype[Any]] = {
+    DT_F16: np.dtype(np.float16),
+    DT_F32: np.dtype(np.float32),
+}
+
+NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+
+HF_SUBLAYER_TO_GGML = {
+    "self_attn.q_proj": "attention.wq.weight",
+    "self_attn.k_proj": "attention.wk.weight",
+    "self_attn.v_proj": "attention.wv.weight",
+    "self_attn.o_proj": "attention.wo.weight",
+}
+
+def translate_tensor_name(t):
+    match = re.match(r'.*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight', t)
+    if match:
+        nn = match.group(1)
+        sub_layer = match.group(2)
+        lora_type = match.group(3)
+
+        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
+        if sub_layer_renamed is None:
+            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
+            exit(1)
+
+        output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
+        return output_string
+    else:
+        print(f"Error: unrecognized tensor {t}")
+        exit(1)
+
+def write_file_header(fout):
+    fout.write(b"ggla"[::-1]) # magic (ggml lora)
+    fout.write(struct.pack("i", 1)) # file version
+
+
+def write_tensor_header(fout, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+    sname = name.encode('utf-8')
+    fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]]))
+    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+    fout.write(sname)
+    fout.seek((fout.tell() + 31) & -32)
+
+
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} adapter_model.bin [ggml_adapter_model.bin]")
+    sys.exit(1)
+
+input_path = sys.argv[1]
+if len(sys.argv) > 2:
+    output_path = sys.argv[2]
+else:
+    output_filename = f"ggml_{os.path.basename(input_path)}"
+    output_path = os.path.join(os.path.dirname(input_path), output_filename)
+
+model = torch.load(input_path, map_location="cpu")
+
+with open(output_path, "wb") as fout:
+    write_file_header(fout)
+    for k, v in model.items():
+        # since ggml doesn't always support other types for the second operand,
+        # the tensors are always converted and exported as f32
+        t = v.float().numpy()
+        print(f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
+        t.tofile(fout)
+
+print(f"Converted {input_path} to {output_path}")

examples/common.cpp

Lines changed: 7 additions & 0 deletions
@@ -140,6 +140,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -238,6 +244,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
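
For context: the new --lora flag is consumed at inference time, e.g. ./main -m <ggml model> --lora <converted adapter> (paths illustrative), where the adapter file is the output of convert-lora-to-ggml.py above.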

examples/common.h

Lines changed: 3 additions & 3 deletions
@@ -31,11 +31,11 @@ struct gpt_params {
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-
+    std::string input_prefix = ""; // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
+    std::string lora_adapter = ""; // lora adapter path
+
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/main/main.cpp

Lines changed: 8 additions & 0 deletions
@@ -107,6 +107,14 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx, params.lora_adapter.c_str(), params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
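
llama_apply_lora_from_file itself lives in llama.h/llama.cpp, which are among the commit's nine files but are not shown in this excerpt. For orientation only, a hedged numpy sketch of what applying a LoRA adapter means under the standard LoRA formulation (merged weight = W + scale * B @ A); the dimensions and scale below are illustrative, not taken from the commit:

# Editorial sketch: the LoRA merge in numpy terms (illustrative, not llama.cpp code).
import numpy as np

d_out, d_in, r = 6, 4, 2                              # toy dimensions
W = np.zeros((d_out, d_in), dtype=np.float32)         # base weight, e.g. attention.wq.weight
A = np.random.randn(r, d_in).astype(np.float32)       # exported above as ...loraA
B = np.random.randn(d_out, r).astype(np.float32)      # exported above as ...loraB
scale = 1.0                                           # assumption: typically lora_alpha / r

W_merged = W + scale * (B @ A)                        # low-rank update added to the weight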

examples/perplexity/perplexity.cpp

Lines changed: 8 additions & 0 deletions
@@ -126,6 +126,14 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx, params.lora_adapter.c_str(), params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");

ggml.c

Lines changed: 45 additions & 1 deletion
@@ -5167,6 +5167,47 @@ static void ggml_compute_forward_add_f32(
     }
 }
 
+static void ggml_compute_forward_add_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    for (int j = ith; j < n; j += nth) {
+        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+        for (int i = 0; i < nc; i++) {
+            float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
+
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
+        }
+    }
+}
+
 static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -5177,12 +5218,15 @@ static void ggml_compute_forward_add(
             {
                 ggml_compute_forward_add_f32(params, src0, src1, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+            } break;
         case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
-        case GGML_TYPE_F16:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
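
The new kernel adds an f32 tensor into an f16 tensor row by row: each f16 element is widened to f32, added, and the sum is rounded back to f16. A tiny numpy illustration of that behaviour (editorial, not part of the commit):

# Editorial sketch: dst = fp16(fp32(src0) + src1), mirroring ggml_compute_forward_add_f16_f32.
import numpy as np

src0 = np.array([1.0, 2.0, 3.0], dtype=np.float16)    # f16 operand (e.g. a base weight row)
src1 = np.array([0.1, -0.2, 0.3], dtype=np.float32)   # f32 operand (e.g. a LoRA delta row)

dst = (src0.astype(np.float32) + src1).astype(np.float16)  # add in f32, round back to f16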

ggml.h

Lines changed: 6 additions & 0 deletions
@@ -415,6 +415,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
+
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor * a,

0 commit comments