Commit 492d7f1

musa: fix all warnings, re-enable -DLLAMA_FATAL_WARNINGS=ON in ci and update doc (#12611)
* musa: fix all warnings
  Signed-off-by: Xiaodong Ye <[email protected]>

* musa: enable -DLLAMA_FATAL_WARNINGS=ON in run.sh
  Signed-off-by: Xiaodong Ye <[email protected]>

* musa: update ci doc (install ccache)
  Signed-off-by: Xiaodong Ye <[email protected]>

* fix Windows build issue
  Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments
  Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments
  Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>
1 parent d3f1f0a commit 492d7f1

20 files changed: +194 additions, -80 deletions

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ docker run --privileged -it \
 Inside the container, execute the following commands:
 
 ```bash
-apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ fi
 if [ ! -z ${GG_BUILD_MUSA} ]; then
     # Use qy1 by default (MTT S80)
     MUSA_ARCH=${MUSA_ARCH:-21}
-    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
 
 ## helpers

ggml/src/ggml-common.h

Lines changed: 12 additions & 6 deletions
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
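
Note: the new GGML_EXTENSION wrapper addresses the pedantic warning that GCC/Clang-based compilers (including the MUSA toolchain) emit for anonymous structs inside unions; __extension__ suppresses it, while MSVC accepts the construct without any keyword, so the macro expands to nothing there. A minimal sketch of the pattern, using a hypothetical block type rather than one from ggml-common.h:

```cpp
// Hypothetical block type, for illustration only.
#ifdef _MSC_VER
#define GGML_EXTENSION
#else
#define GGML_EXTENSION __extension__
#endif

typedef struct {
    GGML_EXTENSION union {      // without __extension__, -Wpedantic/-Wgnu-anonymous-struct
        struct {                // flags the unnamed struct member as a language extension
            unsigned short d;   // delta
            unsigned short m;   // min
        };
        unsigned int dm;        // the same two halves viewed as one 32-bit value
    };
    unsigned char qs[16];       // quantized values
} example_block;
```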

ggml/src/ggml-cuda/common.cuh

Lines changed: 4 additions & 0 deletions
@@ -288,6 +288,10 @@ static __device__ void no_device_code(
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }
 
 #ifdef __CUDA_ARCH__
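
Note: __builtin_unreachable() is a compile-time promise that execution never reaches that point. The guard is MUSA-only, presumably because that toolchain does not treat __trap() as noreturn and would otherwise warn about code appearing to continue past it. A hedged sketch of the idea, with a hypothetical helper rather than the actual no_device_code signature:

```cpp
// Hypothetical device helper: the unreachable hint after the trap silences
// fall-through / missing-return style warnings from compilers that do not
// know __trap() never returns.
static __device__ int unsupported_path() {
    __trap();                 // aborts the kernel at runtime
    __builtin_unreachable();  // tells the compiler this point is never reached
}
```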

ggml/src/ggml-cuda/concat.cu

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
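
Note: blockIdx.y and blockIdx.z are unsigned, while ne01/ne02 are signed ints, so the original comparisons triggered -Wsign-compare. The explicit cast makes both operands unsigned; a small sketch with a hypothetical kernel, assuming the extent is known to be non-negative:

```cpp
// Hypothetical kernel: casting the signed extent documents that it is
// non-negative and keeps the comparison between two unsigned values.
static __global__ void copy_rows(const float * x, float * y, const int ne0, const int ne01) {
    if (blockIdx.y < (unsigned)ne01) {
        y[blockIdx.y*ne0 + threadIdx.x] = x[blockIdx.y*ne0 + threadIdx.x];
    }
}
```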

ggml/src/ggml-cuda/conv-transpose-1d.cu

Lines changed: 4 additions & 2 deletions
@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
         }
     }
     dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
     const int p0 = 0;//opts[3];
     const int d0 = 1;//opts[4];
 
-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
     const int64_t output_size = ggml_nelements(dst);
 
     conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
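
Note: p0 and d0 are currently hard-coded inside the kernel (to 0 and 1), so several parameters go unused and are marked with GGML_UNUSED rather than removed from the signature; the unused host-side locals kernel_size and input_size are simply dropped. A sketch of the idiom, assuming GGML_UNUSED is the usual cast-to-void macro:

```cpp
// Assumption: GGML_UNUSED evaluates its argument and discards it.
#define GGML_UNUSED(x) (void)(x)

// Hypothetical kernel that keeps its full signature for future use while the
// current implementation ignores padding (p0) and dilation (d0):
static __global__ void conv1d_example(const float * src, float * dst, const int p0, const int d0) {
    dst[threadIdx.x] = src[threadIdx.x];
    GGML_UNUSED(p0); GGML_UNUSED(d0); // silences -Wunused-parameter without changing the interface
}
```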

ggml/src/ggml-cuda/convert.cu

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
         return;
     }
 
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
 
     y[i] = x[i];
 }
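
Note: vx is a const void * parameter, so casting it to a non-const src_t * discards the qualifier and can trigger a discarded-qualifier warning; keeping const in the target type avoids it. Illustration with hypothetical names:

```cpp
const void * vx = nullptr;              // e.g. an opaque input buffer
const float * x = (const float *) vx;   // const preserved, no discarded-qualifier warning
```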

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 5 additions & 4 deletions
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
         vals[l] = scale * x[4*threadIdx.x + l];
     }
 
     float amax = fabsf(vals[0]);
     float sum  = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
         amax = fmaxf(amax, fabsf(vals[l]));
         sum += vals[l];
     }
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             q8[l] = roundf(vals[l] / d);
         }
     }
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
         const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
         const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
         *((uint32_t *) &KQ_max_scale) &= ftz_mask;
 
@@ -649,6 +649,7 @@ static __global__ void flash_attn_combine_results(
     dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 
+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
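
Note: two warning classes are handled in this file. sizeof(int) has the unsigned type size_t, so comparing a signed loop index against it triggers -Wsign-compare; wrapping the bound in int(...) keeps the comparison signed. Separately, [[noreturn]] documents that on_no_fattn_vec_case never returns, so callers that end in it are not flagged for missing return paths. A compact sketch of both, with hypothetical function names:

```cpp
#include <cstdio>
#include <cstdlib>

// [[noreturn]] tells the compiler this helper aborts, so a code path that
// ends in a call to it is not "missing a return statement".
[[noreturn]] static void fail_unsupported(const int D) {
    fprintf(stderr, "Unsupported head size %d.\n", D);
    abort();
}

static int pick_case(const int D) {
    // int(sizeof(int)) keeps the bound signed and avoids -Wsign-compare.
    for (int l = 0; l < int(sizeof(int)); ++l) {
        if (D == 64 << l) {
            return l;
        }
    }
    fail_unsupported(D); // noreturn: no warning about falling off the end
}
```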

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 58 additions & 33 deletions
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE
 
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         __syncthreads();
     }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
 
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
 
 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
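
Note: dropping the trailing semicolons avoids -Wextra-semi. The DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2 expansion already ends in ';', so a semicolon at the call site becomes an empty declaration at namespace scope. A reduced sketch with a hypothetical macro:

```cpp
template <int D> void fattn_case() { /* ... */ }

// Hypothetical stand-in for the declaration macro; its expansion already ends in ';'.
#define DECLARE_FATTN_CASE(D) extern template void fattn_case<D>();

DECLARE_FATTN_CASE(64)    // correct: no semicolon needed at the call site
// DECLARE_FATTN_CASE(80); // the extra ';' would be an empty declaration -> -Wextra-semi
```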

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 13 additions & 1 deletion
@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
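
Note: in this and the remaining flash-attention kernels below, the whole body is compiled out when the required feature macros are absent, so every parameter becomes unused in the #else branch; listing them through GGML_UNUSED keeps -Wunused-parameter quiet while NO_DEVICE_CODE still reports the unsupported configuration. A trimmed sketch of the layout, with a hypothetical kernel and feature macro that reuses ggml's GGML_UNUSED/NO_DEVICE_CODE:

```cpp
// Hypothetical kernel showing the guard/stub layout used by the fattn kernels.
static __global__ void attn_example(const char * Q, const char * K, float * dst, const int ne00) {
#ifdef EXAMPLE_FEATURE_AVAILABLE
    // real implementation would live here and use every parameter
    dst[threadIdx.x] = float(Q[threadIdx.x] + K[threadIdx.x]) * ne00;
#else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(dst); GGML_UNUSED(ne00);
    NO_DEVICE_CODE; // ggml macro that traps on the unsupported device path
#endif
}
```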

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 12 additions & 0 deletions
@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
         }
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 13 additions & 1 deletion
@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 10 additions & 0 deletions
@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
