mpt : do not duplicate token_embd.weight on disk

cebtenzzre · cebtenzzre · commit 11ed1fb74673 · 2024-02-22T17:02:37.000-05:00
Previous attempt was ggml-org#3626.
(cherry picked from commit 549fe80)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -618,11 +618,6 @@ def write_tensors(self):
 
             self.gguf_writer.add_tensor(new_name, data)
 
-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-
 
 class OrionModel(Model):
     def set_vocab(self):
diff --git a/llama.cpp b/llama.cpp
@@ -4057,7 +4057,10 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {

Original file line number	Diff line number	Diff line change
`@@ -4057,7 +4057,10 @@ static bool llm_load_tensors(`
`4057`	`4057`	`// output`
`4058`	`4058`	`{`
`4059`	`4059`	`model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});`
`4060`		`- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});`
	`4060`	`+ // same as tok_embd, duplicated to allow offloading`
	`4061`	`+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});`
	`4062`	`+ ml.n_created--; // artificial tensor`
	`4063`	`+ ml.size_data += ggml_nbytes(model.output);`
`4061`	`4064`	`}`
`4062`	`4065`
`4063`	`4066`	`for (int i = 0; i < n_layer; ++i) {`