Skip to content

Commit 0f98acf

Browse files
authored
llama : add support for larger Granite Code Models (20B, 34B) (#7324)
Tie the weights for ARCH_STARCODER to support the larger Granite code models. Partially addresses ggerganov/llama.cpp#7116. There still remain a few things to fix. Currently requires `--override-kv tokenizer.ggml.add_bos_token=bool:false`
1 parent ca57e0f commit 0f98acf

File tree

1 file changed: +8 additions, -1 deletion

llama.cpp

Lines changed: 8 additions & 1 deletion
```diff
@@ -5188,7 +5188,14 @@ static bool llm_load_tensors(
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
+                        // needs to be on GPU
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }

                 for (int i = 0; i < n_layer; ++i) {
```

0 commit comments

Comments
 (0)