Skip to content

Commit 3b5c2f7

Browse files
ggerganov authored and hodlen committed
ggml : fix 32-bit ARM compat for IQ2_XS (whisper/1758)
* ggml : fix 32-bit ARM compat
* ggml : fix fix
* ggml : fix fix fix
1 parent c530d4e commit 3b5c2f7

File tree

1 file changed

+35
-4
lines changed

1 file changed

+35
-4
lines changed

ggml-quants.c

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
272272

273273
// vaddvq_s16
274274
// vpaddq_s16
275+
// vpaddq_s32
275276
// vaddvq_s32
276277
// vaddvq_f32
277278
// vmaxvq_f32
278279
// vcvtnq_s32_f32
280+
// vzip1_u8
281+
// vzip2_u8
279282

280283
inline static int32_t vaddvq_s16(int16x8_t v) {
281284
return
@@ -291,6 +294,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
291294
return vcombine_s16(a0, b0);
292295
}
293296

297+
// Pairwise add for 128-bit s32 vectors, composed from 64-bit-vector intrinsics.
// Result lanes: { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] }.
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    // Each vpadd_s32 folds one input down to its two adjacent-pair sums;
    // the sums from `a` form the low half of the result, those from `b` the high half.
    return vcombine_s32(
        vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
}
302+
294303
// Horizontal add: returns the sum of the four s32 lanes of v.
inline static int32_t vaddvq_s32(int32x4_t v) {
    // Lane indices must be compile-time constants, so each lane is extracted explicitly.
    const int32_t lane0 = vgetq_lane_s32(v, 0);
    const int32_t lane1 = vgetq_lane_s32(v, 1);
    const int32_t lane2 = vgetq_lane_s32(v, 2);
    const int32_t lane3 = vgetq_lane_s32(v, 3);

    return lane0 + lane1 + lane2 + lane3;
}
@@ -316,6 +325,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
316325
return res;
317326
}
318327

328+
// Interleaves the low halves of a and b:
// result = { a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3] }.
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    // vzip_u8 interleaves both inputs in one operation; val[0] is the interleave of the
    // low halves. Using the intrinsic avoids `[]` subscripting on NEON vector types,
    // which is a compiler extension rather than part of the intrinsics API.
    const uint8x8x2_t zipped = vzip_u8(a, b);

    return zipped.val[0];
}
338+
339+
// Interleaves the high halves of a and b:
// result = { a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7] }.
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    // vzip_u8 interleaves both inputs in one operation; val[1] is the interleave of the
    // high halves. Using the intrinsic avoids `[]` subscripting on NEON vector types,
    // which is a compiler extension rather than part of the intrinsics API.
    const uint8x8x2_t zipped = vzip_u8(a, b);

    return zipped.val[1];
}
349+
319350
// vld1q_s16_x2
320351
// vld1q_u8_x2
321352
// vld1q_u8_x4
@@ -7554,9 +7585,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
75547585

75557586
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
75567587

7557-
int8x16x4_t q2u;
7558-
int8x16x4_t q2s;
7559-
int8x16x4_t q8b;
7588+
ggml_int8x16x4_t q2u;
7589+
ggml_int8x16x4_t q2s;
7590+
ggml_int8x16x4_t q8b;
75607591

75617592
int32x4x4_t scales32;
75627593

@@ -7578,7 +7609,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
75787609
scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
75797610
int32x4_t sumi = vdupq_n_s32(0);
75807611
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
7581-
q8b = vld1q_s8_x4(q8); q8 += 64;
7612+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
75827613
q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
75837614
q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
75847615
q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));

0 commit comments

Comments
 (0)