ggml : restrict op on other backends (CUDA, Vulkan) to equal K and V head sizes

ggerganov · ggerganov · commit 1e0f5ad7997f · 2025-03-27T18:27:18.000+02:00
ggml-ci
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3232,6 +3232,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #ifndef FLASH_ATTN_AVAILABLE
             return false;
 #endif // FLASH_ATTN_AVAILABLE
+            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                // different head sizes of K and V are not supported yet
+                return false;
+            }
+            if (op->src[0]->ne[0] == 192) {
+                return false;
+            }
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8764,6 +8764,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                 default:
                     return false;
                 }
+                if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                    // different head sizes of K and V are not supported yet
+                    return false;
+                }
                 if (op->src[0]->type != GGML_TYPE_F32) {
                     return false;
                 }

Original file line number	Diff line number	Diff line change
`@@ -8764,6 +8764,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm`
`8764`	`8764`	`default:`
`8765`	`8765`	`return false;`
`8766`	`8766`	`}`
	`8767`	`+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {`
	`8768`	`+ // different head sizes of K and V are not supported yet`
	`8769`	`+ return false;`
	`8770`	`+ }`
`8767`	`8771`	`if (op->src[0]->type != GGML_TYPE_F32) {`
`8768`	`8772`	`return false;`
`8769`	`8773`	`}`