Commit 492d7f1

musa: fix all warnings, re-enable -DLLAMA_FATAL_WARNINGS=ON in ci and update doc (#12611)
* musa: fix all warnings
  Signed-off-by: Xiaodong Ye <[email protected]>

* musa: enable -DLLAMA_FATAL_WARNINGS=ON in run.sh
  Signed-off-by: Xiaodong Ye <[email protected]>

* musa: update ci doc (install ccache)
  Signed-off-by: Xiaodong Ye <[email protected]>

* fix Windows build issue
  Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments
  Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments
  Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>
1 parent d3f1f0a commit 492d7f1

20 files changed: +194 additions, -80 deletions

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ docker run --privileged -it \
 Inside the container, execute the following commands:
 
 ```bash
-apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ fi
 if [ ! -z ${GG_BUILD_MUSA} ]; then
     # Use qy1 by default (MTT S80)
     MUSA_ARCH=${MUSA_ARCH:-21}
-    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
 
 ## helpers

ggml/src/ggml-common.h

Lines changed: 12 additions & 6 deletions
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
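
Note: the new GGML_EXTENSION wrapper addresses the pedantic warning that GCC/Clang-based compilers (including the MUSA toolchain) emit for anonymous structs inside unions; __extension__ suppresses it, while MSVC accepts the construct without any keyword, so the macro expands to nothing there. A minimal sketch of the pattern, using a hypothetical block type rather than one from ggml-common.h:

```cpp
// Hypothetical block type, for illustration only.
#ifdef _MSC_VER
#define GGML_EXTENSION
#else
#define GGML_EXTENSION __extension__
#endif

typedef struct {
    GGML_EXTENSION union {      // without __extension__, -Wpedantic/-Wgnu-anonymous-struct
        struct {                // flags the unnamed struct member as a language extension
            unsigned short d;   // delta
            unsigned short m;   // min
        };
        unsigned int dm;        // the same two halves viewed as one 32-bit value
    };
    unsigned char qs[16];       // quantized values
} example_block;
```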

ggml/src/ggml-cuda/common.cuh

Lines changed: 4 additions & 0 deletions
@@ -288,6 +288,10 @@ static __device__ void no_device_code(
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }
 
 #ifdef __CUDA_ARCH__
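
Note: __builtin_unreachable() is a compile-time promise that execution never reaches that point. The guard is MUSA-only, presumably because that toolchain does not treat __trap() as noreturn and would otherwise warn about code appearing to continue past it. A hedged sketch of the idea, with a hypothetical helper rather than the actual no_device_code signature:

```cpp
// Hypothetical device helper: the unreachable hint after the trap silences
// fall-through / missing-return style warnings from compilers that do not
// know __trap() never returns.
static __device__ int unsupported_path() {
    __trap();                 // aborts the kernel at runtime
    __builtin_unreachable();  // tells the compiler this point is never reached
}
```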

ggml/src/ggml-cuda/concat.cu

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
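
Note: blockIdx.y and blockIdx.z are unsigned, while ne01/ne02 are signed ints, so the original comparisons triggered -Wsign-compare. The explicit cast makes both operands unsigned; a small sketch with a hypothetical kernel, assuming the extent is known to be non-negative:

```cpp
// Hypothetical kernel: casting the signed extent documents that it is
// non-negative and keeps the comparison between two unsigned values.
static __global__ void copy_rows(const float * x, float * y, const int ne0, const int ne01) {
    if (blockIdx.y < (unsigned)ne01) {
        y[blockIdx.y*ne0 + threadIdx.x] = x[blockIdx.y*ne0 + threadIdx.x];
    }
}
```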

ggml/src/ggml-cuda/conv-transpose-1d.cu

Lines changed: 4 additions & 2 deletions
@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
         }
     }
     dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
     const int p0 = 0;//opts[3];
     const int d0 = 1;//opts[4];
 
-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
     const int64_t output_size = ggml_nelements(dst);
 
     conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
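
Note: p0 and d0 are currently hard-coded inside the kernel (to 0 and 1), so several parameters go unused and are marked with GGML_UNUSED rather than removed from the signature; the unused host-side locals kernel_size and input_size are simply dropped. A sketch of the idiom, assuming GGML_UNUSED is the usual cast-to-void macro:

```cpp
// Assumption: GGML_UNUSED evaluates its argument and discards it.
#define GGML_UNUSED(x) (void)(x)

// Hypothetical kernel that keeps its full signature for future use while the
// current implementation ignores padding (p0) and dilation (d0):
static __global__ void conv1d_example(const float * src, float * dst, const int p0, const int d0) {
    dst[threadIdx.x] = src[threadIdx.x];
    GGML_UNUSED(p0); GGML_UNUSED(d0); // silences -Wunused-parameter without changing the interface
}
```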

ggml/src/ggml-cuda/convert.cu

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
         return;
     }
 
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
 
     y[i] = x[i];
 }
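
Note: vx is a const void * parameter, so casting it to a non-const src_t * discards the qualifier and can trigger a discarded-qualifier warning; keeping const in the target type avoids it. Illustration with hypothetical names:

```cpp
const void * vx = nullptr;              // e.g. an opaque input buffer
const float * x = (const float *) vx;   // const preserved, no discarded-qualifier warning
```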

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 5 additions & 4 deletions
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
         vals[l] = scale * x[4*threadIdx.x + l];
     }
 
     float amax = fabsf(vals[0]);
     float sum  = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
         amax = fmaxf(amax, fabsf(vals[l]));
         sum += vals[l];
     }
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             q8[l] = roundf(vals[l] / d);
         }
     }
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
         const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
         const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
         *((uint32_t *) &KQ_max_scale) &= ftz_mask;
 
@@ -649,6 +649,7 @@ static __global__ void flash_attn_combine_results(
     dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 
+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
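
Note: two warning classes are handled in this file. sizeof(int) has the unsigned type size_t, so comparing a signed loop index against it triggers -Wsign-compare; wrapping the bound in int(...) keeps the comparison signed. Separately, [[noreturn]] documents that on_no_fattn_vec_case never returns, so callers that end in it are not flagged for missing return paths. A compact sketch of both, with hypothetical function names:

```cpp
#include <cstdio>
#include <cstdlib>

// [[noreturn]] tells the compiler this helper aborts, so a code path that
// ends in a call to it is not "missing a return statement".
[[noreturn]] static void fail_unsupported(const int D) {
    fprintf(stderr, "Unsupported head size %d.\n", D);
    abort();
}

static int pick_case(const int D) {
    // int(sizeof(int)) keeps the bound signed and avoids -Wsign-compare.
    for (int l = 0; l < int(sizeof(int)); ++l) {
        if (D == 64 << l) {
            return l;
        }
    }
    fail_unsupported(D); // noreturn: no warning about falling off the end
}
```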

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 58 additions & 33 deletions
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE
 
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         __syncthreads();
     }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
 
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
 
 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
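
Note: dropping the trailing semicolons avoids -Wextra-semi. The DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2 expansion already ends in ';', so a semicolon at the call site becomes an empty declaration at namespace scope. A reduced sketch with a hypothetical macro:

```cpp
template <int D> void fattn_case() { /* ... */ }

// Hypothetical stand-in for the declaration macro; its expansion already ends in ';'.
#define DECLARE_FATTN_CASE(D) extern template void fattn_case<D>();

DECLARE_FATTN_CASE(64)    // correct: no semicolon needed at the call site
// DECLARE_FATTN_CASE(80); // the extra ';' would be an empty declaration -> -Wextra-semi
```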

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 13 additions & 1 deletion
@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
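
Note: in this and the remaining flash-attention kernels below, the whole body is compiled out when the required feature macros are absent, so every parameter becomes unused in the #else branch; listing them through GGML_UNUSED keeps -Wunused-parameter quiet while NO_DEVICE_CODE still reports the unsupported configuration. A trimmed sketch of the layout, with a hypothetical kernel and feature macro that reuses ggml's GGML_UNUSED/NO_DEVICE_CODE:

```cpp
// Hypothetical kernel showing the guard/stub layout used by the fattn kernels.
static __global__ void attn_example(const char * Q, const char * K, float * dst, const int ne00) {
#ifdef EXAMPLE_FEATURE_AVAILABLE
    // real implementation would live here and use every parameter
    dst[threadIdx.x] = float(Q[threadIdx.x] + K[threadIdx.x]) * ne00;
#else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(dst); GGML_UNUSED(ne00);
    NO_DEVICE_CODE; // ggml macro that traps on the unsupported device path
#endif
}
```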

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 12 additions & 0 deletions
@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
         }
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 13 additions & 1 deletion
@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 10 additions & 0 deletions
@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
