Commit 01f45e1

manual merge with llama.cpp master

2 parents: 49f0bfd + 916a9ac
File tree

6 files changed: +867 −497 lines


Makefile

Lines changed: 0 additions & 5 deletions
```diff
@@ -253,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-ifdef LLAMA_CUDA_MMQ_Y
-	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
-else
-	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
-endif # LLAMA_CUDA_MMQ_Y
 #ifdef LLAMA_CUDA_CUBLAS
 #	NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
```

README.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -406,7 +406,6 @@ Building the program with BLAS support may lead to some performance improvements
 -->
 | Option                  | Legal values           | Default | Description |
 |-------------------------|------------------------|---------|-------------|
-| LLAMA_CUDA_MMQ_Y        | Positive integer >= 32 | 64      | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
 | LLAMA_CUDA_FORCE_DMMV   | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
```
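The options in this table are passed as plain make variables at build time. A hedged sketch of a typical invocation (`LLAMA_CUBLAS=1` is llama.cpp's CUDA switch of this era; the tuning values are illustrative, not recommendations):

```shell
# Enable the CUDA backend and tune the mul mat vec kernels.
# Values are illustrative; the table above recommends powers of 2.
make clean
make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=2
```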

ggml-alloc.c

Lines changed: 8 additions & 0 deletions
```diff
@@ -394,6 +394,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
             if (parent == NULL) {
                 break;
             }
+
+            // if the node's data is external, then we cannot re-use it
+            if ((char *) parent->data < (char *) alloc->data ||
+                (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                continue;
+            }
+
             struct hash_node * p_hn = hash_get(ht, parent);
             if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                 if (ggml_is_view(parent)) {
```
