Fix embedding when embedding layer on GPU #1873

Closed · wants to merge 3 commits
11 changes: 11 additions & 0 deletions ggml-cuda.cu
@@ -2481,3 +2481,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
func(tensor->src0, tensor->src1, tensor);
return true;
}

bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
// TODO: do we need to support split tensors (GGML_BACKEND_GPU_SPLIT)?
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
char * src_ptr = (char *) extra->data_device[g_main_device];

CUDA_CHECK(cudaMemcpy(dst, src_ptr + offset, size, cudaMemcpyDeviceToHost));

return true;
}
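
For illustration only (not part of the diff): a minimal sketch of how a caller might use the new ggml_cuda_get_data to pull one row of a GPU-resident tensor back to host memory. The helper name get_row_from_gpu is hypothetical, and the offset arithmetic assumes a contiguous 2-D F32 tensor, matching how the llama.cpp call site below computes its byte offsets.

#include <vector>

#include "ggml.h"
#include "ggml-cuda.h"

// Hypothetical helper: copy row `i` of a contiguous 2-D F32 tensor that lives
// on the GPU backend into a host-side buffer via the new getter.
static std::vector<float> get_row_from_gpu(struct ggml_tensor * t, int64_t i) {
    std::vector<float> row(t->ne[0]);
    const size_t offset = (size_t) i * t->nb[1];      // byte offset of row i (nb[1] = row stride in bytes)
    const size_t size   = t->ne[0] * sizeof(float);   // one row of F32 values
    ggml_cuda_get_data(t, offset, size, row.data());
    return row;
}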
2 changes: 2 additions & 0 deletions ggml-cuda.h
@@ -34,6 +34,8 @@ void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);

#ifdef __cplusplus
}
#endif
7 changes: 7 additions & 0 deletions ggml-opencl.cpp
@@ -1193,3 +1193,10 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
tensor->data = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}

bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
CL_CHECK(clEnqueueReadBuffer(queue, (cl_mem)tensor->data, true, offset, size, dst, 0, NULL, NULL));
CL_CHECK(clFinish(queue));

return true;
}
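
Since ggml_cl_get_data mirrors the CUDA getter's signature, a caller can dispatch on tensor->backend, which is exactly what the llama.cpp hunk below does. A hypothetical backend-agnostic wrapper (not part of this PR, shown only to illustrate the shared signature) might look like:

#include <cstring>

#include "ggml.h"
#if defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// Hypothetical wrapper: copy `size` bytes starting at `offset` from tensor `t` into `dst`,
// choosing the device getter or a plain host memcpy based on the tensor's backend.
static bool get_tensor_data(struct ggml_tensor * t, size_t offset, size_t size, void * dst) {
    switch (t->backend) {
#if defined(GGML_USE_CUBLAS)
        case GGML_BACKEND_GPU:
        case GGML_BACKEND_GPU_SPLIT:
            return ggml_cuda_get_data(t, offset, size, dst);
#elif defined(GGML_USE_CLBLAST)
        case GGML_BACKEND_GPU:
        case GGML_BACKEND_GPU_SPLIT:
            return ggml_cl_get_data(t, offset, size, dst);
#endif
        case GGML_BACKEND_CPU:
        default:
            memcpy(dst, (char *) ggml_get_data(t) + offset, size);
            return true;
    }
}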
2 changes: 2 additions & 0 deletions ggml-opencl.h
@@ -20,6 +20,8 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor);

void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);

#ifdef __cplusplus
}
#endif
21 changes: 20 additions & 1 deletion llama.cpp
@@ -1740,7 +1740,26 @@ static bool llama_eval_internal(
auto & embedding_out = lctx.embedding;

embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
switch (embeddings->backend)
{
#if defined(GGML_USE_CUBLAS)
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
ggml_cuda_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
break;
#elif defined(GGML_USE_CLBLAST)
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
ggml_cl_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
break;
#endif
case GGML_BACKEND_CPU:
default:
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
break;
}
}

if (mem_per_token == 0) {
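
For context, a sketch of the user-facing path this change fixes: with embeddings requested and layers offloaded to the GPU, llama_get_embeddings should return valid data after llama_eval, because the last row of the embeddings tensor is now copied back from the device instead of being read through a host pointer. The model path and parameter values are placeholders, and the calls follow the llama.h API of this period; treat it as an assumption-laden illustration, not part of the PR.

#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.embedding    = true;  // request embedding output
    params.n_gpu_layers = 32;    // offload layers so the embeddings tensor ends up on the GPU backend

    llama_context * ctx = llama_init_from_file("model.bin", params);  // placeholder model path

    std::vector<llama_token> tokens(64);
    const int n_tokens = llama_tokenize(ctx, "Hello world", tokens.data(), (int) tokens.size(), true);

    llama_eval(ctx, tokens.data(), n_tokens, /*n_past=*/0, /*n_threads=*/4);

    const int     n_embd = llama_n_embd(ctx);
    const float * embd   = llama_get_embeddings(ctx);  // the case this PR addresses: GPU-offloaded embedding layer
    printf("embedding[0] = %f (n_embd = %d)\n", embd[0], n_embd);

    llama_free(ctx);
    return 0;
}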