@@ -1685,8 +1685,12 @@ int llama_model_quantize(
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
     // TODO: refactor all of this after PR #801
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
     auto & model = ctx->model;
 
+    const int64_t t_start_lora_us = ggml_time_us();
+
     auto fin = std::ifstream(path_lora, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
@@ -1799,7 +1803,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * tensor = model.tensors[base_name];
-            ggml_tensor * loraA = ggml_transpose(lora_ctx, lora_tensors[base_name + ".loraA"]);
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
             if (tensor->ne[0] != loraA->ne[1]) {
@@ -1826,7 +1830,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 fprintf(stderr, ".");
             }
         }
-    fprintf(stderr, " done\n");
+
+    ggml_free(lora_ctx);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
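
For context on the change: the first and last hunks wrap adapter loading in ggml's usual wall-clock timing pattern, the middle hunk drops the ggml_transpose on loraA (the unchanged tensor->ne[0] != loraA->ne[1] check suggests the adapter weights are now expected on disk in already-transposed layout), and ggml_free(lora_ctx) releases the temporary context before returning. Below is a minimal standalone sketch of the timing pattern; ggml_time_init/ggml_time_us are real ggml helpers, while do_work is a hypothetical stand-in for applying the adapter:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"   // for ggml_time_init() / ggml_time_us()

    // Hypothetical stand-in for the actual work being timed.
    static void do_work() { /* e.g. apply the lora adapter */ }

    int main() {
        ggml_time_init();                           // call once before using ggml_time_us()
        const int64_t t_start_us = ggml_time_us();  // start timestamp, in microseconds

        do_work();

        const int64_t t_us = ggml_time_us() - t_start_us;
        fprintf(stderr, " done (%.2f ms)\n", t_us / 1000.0);  // report elapsed time in ms
        return 0;
    }

Measuring in microseconds and printing in milliseconds (hence the / 1000.0) matches the other t_*_us timers in the file.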