Workaround to set node->backend

mqy · mqy · commit bb590f14826c · 2023-06-18T14:27:56.000+08:00
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
@@ -1599,8 +1599,8 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     // TODO: find the optimal values for these
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        dst->type == GGML_TYPE_F32 /*&&
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
         return true;
     }
 
diff --git a/ggml.c b/ggml.c
@@ -15938,6 +15938,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
             struct ggml_task_stage *stages = node->task_profile.stages;
 
+            // Workaround: mark the node as GPU-backed when any stage uses a GPU task backend.
+            for (int j = 0; j < 3; j++) {
+                if (node->backend == GGML_BACKEND_CPU &&
+                    (stages[j].backend & GGML_TASK_BACKEND_GPU)) {
+                    if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) {
+                        node->backend = GGML_BACKEND_GPU;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+            }
+
             // compute stage n_tasks.
             int n_tasks = stages[1].parallel ? n_threads : 1;
 
@@ -16008,6 +16020,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
 #if defined(GGML_USE_CLBLAST)
+                            GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node));
                             cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
 #else
                             GGML_ASSERT(false);
diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c
@@ -85,7 +85,6 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
                                      struct ggml_task_profile *profiles) {
     UNUSED(node);
     profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
-    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
     profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
     profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
     profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;