@@ -8782,8 +8782,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: mmq/mmv support
 #endif
 
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
-
     const int64_t nb11 = src1->nb[1];
     const int64_t nb1  =  dst->nb[1];
 
@@ -8812,13 +8810,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row  = *dst;
 
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend  = GGML_BACKEND_GPU;
+
     src1_row.extra = &src1_row_extra;
     dst_row.extra  = &dst_row_extra;
 
-    char * src1_original = (char *) src1_extra->data_device[g_main_device];
-    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
 
     if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
+
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
             //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
@@ -8846,6 +8852,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         src1_row_extra.data_device[g_main_device] = src1_contiguous;
         dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
 
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
             const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
@@ -8860,7 +8871,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
                 CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
-                                        nb11, cudaMemcpyDeviceToDevice, stream));
+                                        nb11, src1_kind, stream));
                 num_src1_rows++;
             }
 
@@ -8892,14 +8903,18 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
                 CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
-                                        nb1, cudaMemcpyDeviceToDevice, stream));
+                                        nb1, dst_kind, stream));
                 num_src1_rows++;
             }
         }
 
         ggml_cuda_pool_free(src1_contiguous, as_src1);
         ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -9298,7 +9313,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
 
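
The net effect of the diff: `ggml_cuda_mul_mat_id` now accepts `src1`/`dst` tensors whose data lives on the host, picking `cudaMemcpyHostToDevice` instead of `cudaMemcpyDeviceToDevice` for the per-row gather/scatter copies and synchronizing the stream before returning when `dst` is on the CPU backend. Below is a minimal, self-contained sketch of that copy pattern (illustrative only, not code from this commit; buffer names and sizes are invented):

```cuda
// Sketch: choose the memcpy kind from where the source rows live, gather them
// into a contiguous device buffer, and synchronize before the host reads results.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

int main() {
    const size_t nb     = 4 * sizeof(float); // bytes per "row" (illustrative)
    const int    n_rows = 8;

    // in this example the source rows live on the host (the GGML_BACKEND_CPU case)
    float src_host[8 * 4];
    for (int i = 0; i < 8 * 4; ++i) src_host[i] = (float) i;

    const bool src_on_host = true;
    char * src_original   = (char *) src_host; // would be a device pointer otherwise

    const cudaMemcpyKind src_kind = src_on_host ?
        cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    char * dev_contiguous = nullptr;
    CUDA_CHECK(cudaMalloc((void **) &dev_contiguous, n_rows * nb));

    // gather the rows into a contiguous device buffer, one async copy per row
    for (int i = 0; i < n_rows; ++i) {
        CUDA_CHECK(cudaMemcpyAsync(dev_contiguous + i * nb, src_original + i * nb,
                                   nb, src_kind, stream));
    }

    // copy the gathered rows back to the host and wait before reading them,
    // mirroring the cudaStreamSynchronize() added for a CPU-backend dst
    float out_host[8 * 4];
    CUDA_CHECK(cudaMemcpyAsync(out_host, dev_contiguous, n_rows * nb,
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    printf("out_host[5] = %.1f\n", out_host[5]);

    CUDA_CHECK(cudaFree(dev_contiguous));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}
```

Without the final `cudaStreamSynchronize`, the host could read the destination buffer before the asynchronous copies on the stream have finished, which is why the diff adds the synchronization only for the CPU-backend `dst` case.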