@@ -1685,8 +1685,12 @@ int llama_model_quantize(
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
     // TODO: refactor all of this after PR #801
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
     auto & model = ctx->model;
 
+    const int64_t t_start_lora_us = ggml_time_us();
+
     auto fin = std::ifstream(path_lora, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
@@ -1799,7 +1803,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * tensor = model.tensors[base_name];
-            ggml_tensor * loraA = ggml_transpose(lora_ctx, lora_tensors[base_name + ".loraA"]);
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
             if (tensor->ne[0] != loraA->ne[1]) {
@@ -1826,7 +1830,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 fprintf(stderr, ".");
             }
         }
-    fprintf(stderr, " done\n");
+
+    ggml_free(lora_ctx);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
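
For context on the change: the first and last hunks wrap adapter loading in ggml's usual wall-clock timing pattern, the middle hunk drops the ggml_transpose on loraA (the unchanged tensor->ne[0] != loraA->ne[1] check suggests the adapter weights are now expected on disk in already-transposed layout), and ggml_free(lora_ctx) releases the temporary context before returning. Below is a minimal standalone sketch of the timing pattern; ggml_time_init/ggml_time_us are real ggml helpers, while do_work is a hypothetical stand-in for applying the adapter:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"   // for ggml_time_init() / ggml_time_us()

    // Hypothetical stand-in for the actual work being timed.
    static void do_work() { /* e.g. apply the lora adapter */ }

    int main() {
        ggml_time_init();                           // call once before using ggml_time_us()
        const int64_t t_start_us = ggml_time_us();  // start timestamp, in microseconds

        do_work();

        const int64_t t_us = ggml_time_us() - t_start_us;
        fprintf(stderr, " done (%.2f ms)\n", t_us / 1000.0);  // report elapsed time in ms
        return 0;
    }

Measuring in microseconds and printing in milliseconds (hence the / 1000.0) matches the other t_*_us timers in the file.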