
Commit 3a7e1ce

Merge branch 'ggerganov:master' into vulkan
2 parents: c01ccf8 + b56f079

23 files changed: +562 -41 lines changed

convert_hf_to_gguf.py

Lines changed: 41 additions & 0 deletions
@@ -687,6 +687,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
             # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
             res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
 
         if res is None:
             logger.warning("\n")
@@ -3373,6 +3376,24 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
 
+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
@@ -3831,6 +3852,7 @@ def prepare_tensors(self):
 
 
 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -3852,6 +3874,15 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3864,6 +3895,16 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
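Background on the new chkhsh branch: convert_hf_to_gguf.py fingerprints a model's pre-tokenizer by hashing the token ids its tokenizer produces for a fixed probe string and matching the digest against a known list, so DeepSeek-V3's digest now maps to the "deepseek-v3" pre-tokenizer. The sketch below is a hedged simplification of that idea, not the script's exact code; detect_pre_tokenizer and the probe text are illustrative, while the two digests and names come from the diff above.

# Simplified sketch of the chkhsh lookup (not the repository's exact code).
# `tokenizer` is assumed to be a Hugging Face tokenizer object.
import hashlib

KNOWN_PRE_TOKENIZERS = {
    # digests and names as added in this commit
    "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": "deepseek-v3",
    "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": "megrez",
}

def detect_pre_tokenizer(tokenizer, probe_text: str) -> str | None:
    # hash the token ids produced for a fixed probe string; tokenizers that
    # pre-tokenize identically yield the same digest
    chkhsh = hashlib.sha256(str(tokenizer.encode(probe_text)).encode()).hexdigest()
    return KNOWN_PRE_TOKENIZERS.get(chkhsh)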

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
 ]
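The one-line registry addition above is what drives the convert_hf_to_gguf.py change: convert_hf_to_gguf_update.py walks this list, fetches each tokenizer, computes its chkhsh, and regenerates the matching `if chkhsh == ...` branch. A rough illustration of that generation step, using a hypothetical emit_chkhsh_branch helper rather than the script's actual internals:

# Hypothetical helper for illustration: render one registry entry plus its
# computed digest into the branch convert_hf_to_gguf.py dispatches on.
def emit_chkhsh_branch(name: str, repo: str, chkhsh: str) -> str:
    return (
        f'        if chkhsh == "{chkhsh}":\n'
        f"            # ref: {repo}\n"
        f'            res = "{name}"\n'
    )

print(emit_chkhsh_branch(
    "deepseek-v3",
    "https://huggingface.co/deepseek-ai/DeepSeek-V3",
    "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5",
))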

ggml/CMakeLists.txt

Lines changed: 0 additions & 20 deletions
@@ -252,26 +252,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml LIBRARY PUBLIC_HEADER)
 install(TARGETS ggml-base LIBRARY)
 
-# FIXME: this should be done in the backend cmake files
-if (GGML_METAL)
-    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
-    install(
-        FILES src/ggml-metal/ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-
-    if (NOT GGML_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
-endif()
-
 if (GGML_STANDALONE)
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
         ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc

ggml/src/ggml-backend.cpp

Lines changed: 4 additions & 1 deletion
@@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }

ggml/src/ggml-metal/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
@@ -103,3 +103,19 @@ else()
         DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
     )
 endif() # GGML_METAL_EMBED_LIBRARY
+
+if (NOT GGML_METAL_EMBED_LIBRARY)
+    install(
+        FILES src/ggml-metal/ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+    install(
+        FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        DESTINATION ${CMAKE_INSTALL_BINDIR}
+    )
+endif()

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 134 additions & 6 deletions
@@ -93,9 +93,23 @@ enum rpc_cmd {
     RPC_CMD_COPY_TENSOR,
     RPC_CMD_GRAPH_COMPUTE,
     RPC_CMD_GET_DEVICE_MEMORY,
+    RPC_CMD_INIT_TENSOR,
+    RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_COUNT,
 };
 
+struct rpc_msg_get_alloc_size_req {
+    rpc_tensor tensor;
+};
+
+struct rpc_msg_get_alloc_size_rsp {
+    uint64_t alloc_size;
+};
+
+struct rpc_msg_init_tensor_req {
+    rpc_tensor tensor;
+};
+
 struct rpc_msg_alloc_buffer_req {
     uint64_t size;
 };
@@ -461,10 +475,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
 }
 
 static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    UNUSED(buffer);
-    if (ggml_is_quantized(tensor->type)) {
-        // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
-        GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+
+    // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+    // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
+    // In particular, only quantized tensors need padding
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        rpc_msg_init_tensor_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
+        GGML_ASSERT(status);
     }
 }
 
@@ -577,8 +599,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
 }
 
 static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    UNUSED(buft);
-    return ggml_nbytes(tensor);
+    // See comments in init_tensor.
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+        auto sock = get_socket(buft_ctx->endpoint);
+
+        rpc_msg_get_alloc_size_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        rpc_msg_get_alloc_size_rsp response;
+        bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
+        GGML_ASSERT(status);
+
+        return response.alloc_size;
+    } else {
+        return ggml_nbytes(tensor);
+    }
 }
 
 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@@ -757,6 +794,8 @@ class rpc_server {
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
+    bool init_tensor(const rpc_msg_init_tensor_req & request);
+    bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
 
 private:
     ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -770,6 +809,36 @@ class rpc_server {
     std::unordered_set<ggml_backend_buffer_t> buffers;
 };
 
+bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+    ggml_backend_buffer_type_t buft;
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    if (tensor->buffer == nullptr) {
+        // No buffer allocated.
+        buft = ggml_backend_get_default_buffer_type(backend);
+    } else {
+        buft = tensor->buffer->buft;
+    }
+
+    response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
+
+    ggml_free(ctx);
+    return true;
+}
+
 void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
@@ -905,6 +974,40 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     return true;
 }
 
+bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    // Call the backend's buffer_init_tensor function
+    ggml_backend_buffer_t buffer = tensor->buffer;
+    if (buffer && buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    } else {
+        GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
+    }
+
+    if (tensor->extra != nullptr) {
+        // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
+        // Currently unimplemented.
+        GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    ggml_free(ctx);
+    return true;
+}
+
 bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
     struct ggml_init_params params {
         /*.mem_size =*/ ggml_tensor_overhead(),
@@ -1058,6 +1161,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
            }
            break;
        }
+        case RPC_CMD_GET_ALLOC_SIZE: {
+            rpc_msg_get_alloc_size_req request;
+            if (!recv_msg(sockfd, &request, sizeof(request))) {
+                return;
+            }
+            rpc_msg_get_alloc_size_rsp response;
+            server.get_alloc_size(request, response);
+            if (!send_msg(sockfd, &response, sizeof(response))) {
+                return;
+            }
+            break;
+        }
        case RPC_CMD_GET_ALIGNMENT: {
            if (!recv_msg(sockfd, nullptr, 0)) {
                return;
@@ -1133,6 +1248,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
            }
            break;
        }
+        case RPC_CMD_INIT_TENSOR: {
+            rpc_msg_init_tensor_req request;
+            if (!recv_msg(sockfd, &request, sizeof(request))) {
+                return;
+            }
+            if (!server.init_tensor(request)) {
+                return;
+            }
+            if (!send_msg(sockfd, nullptr, 0)) {
+                return;
+            }
+            break;
+        }
        case RPC_CMD_GET_TENSOR: {
            rpc_msg_get_tensor_req request;
            if (!recv_msg(sockfd, &request, sizeof(request))) {
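The client-side changes above share one gating rule: the new INIT_TENSOR and GET_ALLOC_SIZE round trips are only issued when server-side padding can make the real allocation differ from the client's own ggml_nbytes() result, i.e. for quantized tensors whose first dimension is not a multiple of 512 and that are not views of another tensor. A small Python sketch of that rule follows (hypothetical names, not the repository's C++):

# Hedged sketch of the RPC client's decision logic, mirroring:
#   ggml_is_quantized(type) && ne[0] % 512 != 0 && view_src == nullptr
def needs_server_query(is_quantized: bool, ne0: int, is_view: bool) -> bool:
    return is_quantized and (ne0 % 512 != 0) and not is_view

def rpc_alloc_size(is_quantized: bool, ne0: int, is_view: bool,
                   local_nbytes: int, ask_server) -> int:
    if needs_server_query(is_quantized, ne0, is_view):
        return ask_server()   # RPC_CMD_GET_ALLOC_SIZE round trip to the server
    return local_nbytes       # equivalent of ggml_nbytes(tensor) on the client

Keeping the common case (unquantized or already 512-aligned tensors) off the wire is the bandwidth concern the in-code comments mention.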

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ if (Vulkan_FOUND)
         OUTPUT ${_ggml_vk_header}
                ${_ggml_vk_source}
 
-        COMMAND ${_ggml_vk_genshaders_cmd}
+        COMMAND "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}"
             --glslc ${Vulkan_GLSLC_EXECUTABLE}
             --input-dir ${_ggml_vk_input_dir}
             --output-dir ${_ggml_vk_output_dir}
